Index: common-inc.scm ================================================================== --- common-inc.scm +++ common-inc.scm @@ -921,11 +921,11 @@ (define (common:get-normalized-cpu-load-raw remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost (or (common:get-cached-info actual-host "normalized-load") (let ((data (if remote-host (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end") + (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"") read-lines) (append (with-input-from-file "/proc/loadavg" read-lines) (with-input-from-file "/proc/cpuinfo" @@ -943,11 +943,11 @@ (tal (cdr data)) (loads #f) (proc-num 0) ;; processor includes threads (phys-num 0) ;; physical chip on motherboard (core-num 0)) ;; core - ;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num) + ;;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num) (if (null? tal) ;; have all our data, calculate normalized load and return result (let* ((act-proc (+ proc-num 1)) (act-phys (+ phys-num 1)) (act-core (+ core-num 1)) (adj-proc-load (/ (car loads) act-proc)) @@ -1096,43 +1096,15 @@ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) - (let* ((loadavg (common:get-cpu-load remote-host)) - (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (common:get-num-cpus remote-host) - numcpus-in)) - (maxload (if force-maxload - maxload-in - (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? - (first (car loadavg)) - (next (cadr loadavg)) - (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 - (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) - (cond - ((and (> first adjload) - (> count 0)) - (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) - ((and (> loadjmp numcpus) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) - (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) - (hh (if hh-dat (car hh-dat) #f)) - (numcpus (common:get-num-cpus hh))) + (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! @@ -1139,11 +1111,11 @@ (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) (if (eof-object? inl) (begin - (common:write-cached-info remote-host "num-cpus" numcpu) + (common:write-cached-info actual-host "num-cpus" numcpu) numcpu) (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) (+ numcpu 1) numcpu) (read-line)))))) @@ -1159,10 +1131,39 @@ ;; (define (common:wait-for-normalized-load maxload msg remote-host) (let ((num-cpus (common:get-num-cpus remote-host))) (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) + (let* ((loadavg (common:get-cpu-load remote-host)) + (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (common:get-num-cpus remote-host) + numcpus-in)) + (maxload (if force-maxload + maxload-in + (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? + (first (car loadavg)) + (next (cadr loadavg)) + (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 + (loadjmp (- first next)) + (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously + (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + (cond + ((and (> first adjload) + (> count 0)) + (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) + ((and (> loadjmp numcpus) + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) + (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" Index: runs-inc.scm ================================================================== --- runs-inc.scm +++ runs-inc.scm @@ -928,11 +928,10 @@ (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! (not (equal? x hed))) (runs:calc-not-completed prereqs-not-met))) (loop-list (list hed tal reg reruns)) ;; configure the load runner - (numcpus (common:get-num-cpus #f)) (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "2.0"))) ;; use a non-number string to disable (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" (string-intersperse @@ -1030,14 +1029,19 @@ (not (member 'exclusive testmode))))) ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing - (if maxload ;; only gate if maxload is specified - (common:wait-for-cpuload maxload numcpus waitdelay)) + + ;; jobtools maxload is useful for where the full Megatest run is done on one machine + (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized + (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) + + ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) Index: server-inc.scm ================================================================== --- server-inc.scm +++ server-inc.scm @@ -130,11 +130,12 @@ (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + ;;(common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + (common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory)))