2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
|
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
(if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
(debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
effective-normalized-load " continuing."))
(debug:print 0 *default-log-port* "Load on " effective-host ", "
first" could not be retrieved. Giving up and continuing."))))))
;;======================================================================
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;; (let* ((loadavg (common:get-cpu-load remote-host))
;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
;; (common:get-num-cpus remote-host)
;; numcpus-in))
;; (maxload (if force-maxload
;; maxload-in
;; (if (number? maxload-in)
;; (max maxload-in 0.5)
;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
;; (first (car loadavg))
;; (next (cadr loadavg))
;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
;; ;; numcpus (or could be
;; ;; maxload) is zero,
;; ;; crude fallback is to
;; ;; at least use 1
;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
;; 0
;; next))) ;; we will force a conservative calculation any time next is large.
;; (first-next-avg (/ (+ first next) 2))
;; ;; add some randomness to the time to break any alignment
;; ;; where netbatch dumps many jobs to machines simultaneously
;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
;; (/ (- 1000 count) 10)
;; waitdelay)
;; (- first adjmaxload) ))))
;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit"))
;; ;; effective load accounts for load jumps; this should eliminate all the first-next-avg, adjwait, load-jump-limit
;; ;; etc.
;; (effective-load (common:get-intercept first next))
;; (effective-host (or remote-host "localhost"))
;; (normalized-effective-load (/ effective-load numcpus))
;; (will-wait (> normalized-effective-load maxload)))
;;
;; ;; let's let the user know once in a long while that load checking
;; ;; is happening but not constantly report it
;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
;;
;; (debug:print-info 1 *default-log-port*
;; "On host: " effective-host
;; ", effective load: " effective-load
;; ", numcpus: " numcpus
;; ", normalized effective load: " normalized-effective-load
;; )
;;
;; (cond
;; ;; bad data, try again to get the data
;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
;; (> num-tries 0))
;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
;; (thread-sleep! 10)
;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay
;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
;; ;; need to wait for load to drop
;; ((and will-wait ;; (> first adjmaxload)
;; (> count 0))
;; (debug:print-info 0 *default-log-port*
;; "Delaying " 15 ;; adjwait
;; " seconds due to normalized effective load " normalized-effective-load ;; first
;; " exceeding max of " adjmaxload
;; " on server " (or remote-host (get-host-name))
;; " (normalized load-limit: " maxload ") " (if msg msg ""))
;; (thread-sleep! 15) ;; adjwait)
;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;; ((and (> loadjmp (cond
;; (load-jump-limit load-jump-limit)
;; ((> numcpus 8)(/ numcpus 2))
;; ((> numcpus 4)(/ numcpus 1.2))
;; (else 0.5)))
;; (> count 0))
;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
;; (if msg msg ""))
;; (thread-sleep! adjwait)
;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;; (else
;; (if (> num-tries 0)
;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
;;
;; Run the `uname` command and return the first line of its output.
;;
;; params: optional rest arguments. When at least one is supplied, the first
;;         is appended to the command string "uname " in place of the
;;         default "-a"; any further arguments are ignored.
;;
;; Returns (caar result) of process:cmd-run->list, or the string "unknown"
;; when the car of that result is the empty list.
;; NOTE(review): presumably process:cmd-run->list returns a list whose first
;; element is the list of output lines -- confirm against its definition.
;;
(define (get-uname . params)
  (let* ((flags     (if (null? params) "-a" (car params)))
         (uname-res (process:cmd-run->list (conc "uname " flags))))
    (if (null? (car uname-res))
        "unknown"
        (caar uname-res))))
;; For reasons I don't understand, multiple calls to real-path in parallel
;; threads must be protected by mutexes. No locking is done inside this
;; procedure itself.
;;
(define (common:real-path inpath)
  ;; Resolve inpath by running "readlink -f <inpath>" through a pipe and
  ;; returning whatever a single read-line on that pipe yields.
  ;;
  ;; Earlier attempts, kept for reference:
  ;;
  ;;   (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
  ;;
  ;;   (let-values
  ;;       (((inp oup pid) (process "readlink" (list "-f" inpath))))
  ;;     (with-input-from-port inp
  ;;       (let loop ((inl (read-line))
  ;;                  (res #f))
  ;;         (print "inl=" inl)
  ;;         (if (eof-object? inl)
  ;;             (begin
  ;;               (close-input-port inp)
  ;;               (close-output-port oup)
  ;;               ;; (process-wait pid)
  ;;               res)
  ;;             (loop (read-line) inl))))))
  ;;
  ;; NOTE(review): inpath is spliced into the command string unquoted, so a
  ;; path containing whitespace or shell metacharacters would presumably be
  ;; misparsed -- confirm whether callers can ever pass such paths.
  (let ((readlink-cmd (conc "readlink -f " inpath)))
    (with-input-from-pipe readlink-cmd read-line)))
;;======================================================================
;; D I S K S P A C E
;;======================================================================
(define (common:get-disk-space-used fpath)
|
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
|
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
((equal? status "WARN") "orange")
((equal? status "KILLED") "orange")
((equal? status "KILLREQ") "purple")
((equal? status "RUNNING") "blue")
((equal? status "ABORT") "brown")
(else "black")))
;;======================================================================
;; N A N O M S G C L I E N T
;;======================================================================
;;
;;
;;
;; (define (common:send-dboard-main-changed)
;; (let* ((dashboard-ips (mddb:get-dashboards)))
;; (for-each
;; (lambda (ipadr)
;; (let* ((soc (common:open-nm-req (conc "tcp://" ipadr)))
;; (msg (conc "main " *toppath*))
;; (res (common:nm-send-receive-timeout soc msg)))
;; (if (not res) ;; couldn't reach that dashboard - remove it from db
;; (print "ERROR: couldn't reach dashboard " ipadr))
;; res))
;; dashboard-ips)))
;;
;;
;; ;;======================================================================
;; ;; D A S H B O A R D D B
;; ;;======================================================================
;;
;; (define (mddb:open-db)
;; (let* ((db (open-database (conc (get-environment-variable "HOME") "/.dashboard.db"))))
;; (set-busy-handler! db (busy-timeout 10000))
;; (for-each
;; (lambda (qry)
;; (exec (sql db qry)))
;; (list
;; "CREATE TABLE IF NOT EXISTS vars (id INTEGER PRIMARY KEY,key TEXT, val TEXT, CONSTRAINT varsconstraint UNIQUE (key));"
;; "CREATE TABLE IF NOT EXISTS dashboards (
;; id INTEGER PRIMARY KEY,
;; pid INTEGER,
;; username TEXT,
;; hostname TEXT,
;; ipaddr TEXT,
;; portnum INTEGER,
;; start_time TIMESTAMP DEFAULT (strftime('%s','now')),
;; CONSTRAINT hostport UNIQUE (hostname,portnum)
;; );"
;; ))
;; db))
;;
;; ;; register a dashboard
;; ;;
;; (define (mddb:register-dashboard port)
;; (let* ((pid (current-process-id))
;; (hostname (get-host-name))
;; (ipaddr (server:get-best-guess-address hostname))
;; (username (current-user-name)) ;; (car userinfo)))
;; (db (mddb:open-db)))
;; (print "Register monitor, pid: " pid ", hostname: " hostname ", port: " port ", username: " username)
;; (exec (sql db "INSERT OR REPLACE INTO dashboards (pid,username,hostname,ipaddr,portnum) VALUES (?,?,?,?,?);")
;; pid username hostname ipaddr port)
;; (close-database db)))
;;
;; ;; unregister a monitor
;; ;;
;; (define (mddb:unregister-dashboard host port)
;; (let* ((db (mddb:open-db)))
;; (print "Register unregister monitor, host:port=" host ":" port)
;; (exec (sql db "DELETE FROM dashboards WHERE hostname=? AND portnum=?;") host port)
;; (close-database db)))
;;
;; ;; get registered dashboards
;; ;;
;; (define (mddb:get-dashboards)
;; (let ((db (mddb:open-db)))
;; (query fetch-column
;; (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;"))))
;;======================================================================
;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S
;;======================================================================
;;
;; [hosts]
;; arm cubie01 cubie02
|