Overview
Comment: | Added missing files. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-multi-db |
Files: | files | file ages | folders |
SHA1: |
366547935e9b654834b37a1cf05cf02a |
User & Date: | matt on 2019-02-04 05:17:41 |
Other Links: | branch diff | manifest | tags |
Context
2019-02-04
| ||
23:07 | yada check-in: b28d552c97 user: matt tags: v1.65-multi-db | |
05:17 | Added missing files. check-in: 366547935e user: matt tags: v1.65-multi-db | |
2019-02-03
| ||
23:13 | Added db init, cleaned up inter-module deps a little more. check-in: 86a0fe717f user: matt tags: v1.65-multi-db | |
Changes
Added mtserve.scm version [0b98a50db4].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | ;; Copyright 2006-2017, Matthew Welland. ;; ;; This file is part of Megatest. ;; ;; Megatest is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; ;; Megatest is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; ;; (include "common.scm") ;; (include "megatest-version.scm") ;; fake out readline usage of toplevel-command (define (toplevel-command . a) #f) (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:) readline apropos json directory-utils typed-records srfi-18 extras format (prefix pkts pkts:)) (declare (uses common)) (declare (uses megatest-version)) (declare (uses margs)) (declare (uses server)) ;; (declare (uses daemon)) (declare (uses db)) (import db) (declare (uses portlogger)) (import portlogger) (declare (uses nmsg-transport)) (import (prefix nmsg-transport nmsg:)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (define *usage-log-file* #f) ;; put path to file for logging usage in this var in the ~/.megatestrc file (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file ;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.mtserverrc"))) (if (file-exists? debugcontrolf) (load debugcontrolf))) ;; usage logging, careful with this, it is not designed to deal with all real world challenges! ;; (if (and *usage-log-file* (file-write-access? *usage-log-file*)) (with-output-to-file *usage-log-file* (lambda () (print (if *usage-use-seconds* (current-seconds) (time->string (seconds->local-time (current-seconds)) "%Yww%V.%w %H:%M:%S")) " " (current-user-name) " " (current-directory) " " "\"" (string-intersperse (argv) " ") "\"")) #:append)) ;; Disabled help items ;; -rollup : (currently disabled) fill run (set by :runname) with latest test(s) ;; from prior runs with same keys ;; -daemonize : fork into background and disconnect from stdin/out (define help (conc " Megatest, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version " license GPL, Copyright Matt Welland 2006-2017 Usage: mtserver [options] -h : this help -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") Launching and managing runs -run : run all tests or as specified by -testpatt -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname -log logfile : send stdout and stderr to logfile -list-servers : list the servers -kill-servers : kill all servers -repl : start a repl (useful for extending megatest) -ping run-id|host:port : ping server, exit with 0 if found -debug N|N,M,O... : enable debug 0-N or N and M and O ... Examples # Get test path, use '.' to get a single path or a specific path/file pattern megatest -test-files 'logs/*.log' -target ubuntu/n%/no% -runname w49% -testpatt test_mt% Called as " (string-intersperse (argv) " ") " Version " megatest-version ", built from " megatest-fossil-hash )) ;; -gui : start a gui interface ;; -config fname : override the runconfigs file with fname ;; process args (define remargs (args:get-args (argv) (list "-start-dir" "-server" "-port" "-log" ) (list "-h" "-help" "--help" "-manual" "-version" "-list-servers" "-kill-servers" "-v" ;; verbose 2, more than normal (normal is 1) "-q" ;; quiet 0, errors/warnings only "-diff-rep" ) args:arg-hash 0)) ;; Add args that use remargs here ;; (if (and (not (null? remargs)) (not (or (args:get-arg "-runstep") (args:get-arg "-envcap") (args:get-arg "-envdelta") ) )) (debug:print-error 0 *default-log-port* "Unrecognised arguments: " (string-intersperse (if (list? remargs) remargs (argv)) " "))) ;; before doing anything else change to the start-dir if provided ;; (if (args:get-arg "-start-dir") (if (file-exists? (args:get-arg "-start-dir")) (let ((fullpath (common:real-path (args:get-arg "-start-dir")))) (setenv "PWD" fullpath) (change-directory fullpath)) (begin (debug:print-error 0 *default-log-port* "non-existant start dir " (args:get-arg "-start-dir") " specified, exiting.") (exit 1)))) ;; immediately set MT_TARGET if -reqtarg or -target are available ;; (let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target")))) (if targ (setenv "MT_TARGET" targ))) ;; The watchdog is to keep an eye on things like db sync etc. ;; ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage (define *watchdog* (make-thread (lambda () (handle-exceptions exn (begin (print-call-chain) (print " message: " ((condition-property-accessor 'exn 'message) exn))) (common:watchdog))) "Watchdog thread")) ;;====================================================================== ;; Strive for clean exit handling ;;====================================================================== (define (server-exit-procedure) (on-exit (lambda () ;; close the databases, ensure the pkt is removed! 0))) ;; Copied from the SDL2 examples. ;; ;; Schedule quit! to be automatically called when your program exits normally. (on-exit server-exit-procedure) ;; Install a custom exception handler that will call quit! and then ;; call the original exception handler. This ensures that quit! will ;; be called even if an unhandled exception reaches the top level. (current-exception-handler (let ((original-handler (current-exception-handler))) (lambda (exception) (server-exit-procedure) (original-handler exception)))) ;;(if (not (args:get-arg "-server")) ;; (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog (let* ((no-watchdog-args '("-list-runs" "-testdata-csv" "-list-servers" "-server" "-list-disks" "-list-targets" "-show-runconfig" ;;"-list-db-targets" "-show-runconfig" "-show-config" "-show-cmdinfo" "-cleanup-db")) (no-watchdog-args-vals (filter (lambda (x) x) (map args:get-arg no-watchdog-args))) (start-watchdog (null? no-watchdog-args-vals))) ;;(BB> "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals) (if start-watchdog (thread-start! *watchdog*))) ;; bracket open-output-file with code to make leading directory if it does not exist and handle exceptions (define (open-logfile logpath) (condition-case (let* ((log-dir (or (pathname-directory logpath) "."))) (if (not (directory-exists? log-dir)) (system (conc "mkdir -p " log-dir))) (open-output-file logpath)) (exn () (debug:print-error 0 *default-log-port* "Could not open log file for write: "logpath) (define *didsomething* #t) (exit 1)))) ;; this segment will run launch:setup only if -log is not set. This is fairly safe as servers are not ;; manually started and thus should never be started in a non-megatest area. Thus no need to handle situation ;; where (launch:setup) returns #f? ;; (if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server (handle-exceptions exn (begin (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn)) ) (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log"))) (oup (open-logfile logf))) (if (not (args:get-arg "-log")) (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log (debug:print-info 0 *default-log-port* "Sending log output to " logf) (set! *default-log-port* oup)))) (if (or (args:get-arg "-h") (args:get-arg "-help") (args:get-arg "--help")) (begin (print help) (exit))) (if (args:get-arg "-version") (begin (print (common:version-signature)) ;; (print megatest-version) (exit))) (define *didsomething* #f) |
Added util.scm version [8fad541d3c].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 | ;;====================================================================== ;; Copyright 2018, Matthew Welland. ;; ;; This file is part of Megatest. ;; ;; Megatest is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; ;; Megatest is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; ;;====================================================================== ;;====================================================================== ;; Tests ;;====================================================================== (declare (unit util)) (declare (uses common)) (module util * (import chicken scheme data-structures extras srfi-13 ports ) ;;====================================================================== ;; L O C K I N G M E C H A N I S M S ;;====================================================================== ;; faux-lock is deprecated. Please use simple-lock below ;; (define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count (if (> wait-time 0) (begin (thread-sleep! 1) (if (eq? wait-time 1) ;; only one second left, steal the lock (begin (debug:print-info 0 *default-log-port* "stealing lock for " keyname) (common:faux-unlock keyname force: #t))) (common:faux-lock keyname wait-time: (- wait-time 1))) #f) (begin (rmt:no-sync-set keyname (conc (current-process-id))) (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) (define (common:faux-unlock keyname #!key (force #f)) (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) (begin (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) #t) #f)) ;; simple lock. improve and converge on this one. ;; (define (common:simple-lock keyname) (rmt:no-sync-get-lock keyname)) (define (common:run-a-command cmd #!key (with-vars #f)) (let* ((pre-cmd (dtests:get-pre-command)) (post-cmd (dtests:get-post-command)) (fullcmd (if (or pre-cmd post-cmd) (conc pre-cmd cmd post-cmd) (conc "viewscreen " cmd)))) (debug:print-info 02 *default-log-port* "Running command: " fullcmd) (if with-vars (common:without-vars cmd) (common:without-vars fullcmd "MT_.*")))) ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; ;; return list of ;; ( reachable? cpuload update-time ) (define (common:get-host-info hostname) (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data (load (car loadinfo)) (load-sample-time (cdr loadinfo)) (load-sample-age (- (current-seconds) load-sample-time)) (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds (host-last-update-timeout-seconds 4) (host-rec (hash-table-ref/default *host-loads* hostname #f)) ) (cond ((< load-sample-age loadinfo-timeout-seconds) (list #t load-sample-time load)) ((and host-rec (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) (list #t (host-last-update host-rec) (host-last-cpuload host-rec ))) ((common:unix-ping hostname) (list #t (current-seconds) (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds (else (list #f 0 -1) ;; bad host, don't use! )))) (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" (caar uname-res)))) (define (get-unix-df path) (let* ((df-results (process:cmd-run->list (conc "df " path))) (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) (freespc #f)) ;; (write df-results) (for-each (lambda (l) (let ((match (string-search space-rx l))) (if match (let ((newval (string->number (cadr match)))) (if (number? newval) (set! freespc newval)))))) (car df-results)) freespc)) ;; given path get free space, allows override in [setup] ;; with free-space-script /path/to/some/script.sh ;; (define (get-df path) (if (configf:lookup *configdat* "setup" "free-space-script") (with-input-from-pipe (conc (configf:lookup *configdat* "setup" "free-space-script") " " path) (lambda () (let ((res (read-line))) (if (string? res) (string->number res))))) (get-unix-df path))) (define (std-exit-procedure) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated (if *task-db* (let ((db (cdr *task-db*))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) (sqlite3:finalize! db #t) ;; (vector-set! *task-db* 0 #f) (set! *task-db* #f))))) (http-client#close-all-connections!) ;; (if (and *runremote* ;; (remote-conndat *runremote*)) ;; (begin ;; (http-client#close-all-connections!))) ;; for http-client (if (not (eq? *default-log-port* (current-error-port))) (close-output-port *default-log-port*)) (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) (th2 (make-thread (lambda () (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") (if no-hurry (begin (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff (begin (thread-sleep! 2))) (debug:print 4 *default-log-port* " ... done") ) "clean exit"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) ) ) 0) ;; from metadat lookup MEGATEST_VERSION ;; (define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB (rmt:get-var "MEGATEST_VERSION")) (define (common:set-last-run-version) (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) ;;====================================================================== ;; D I S K S P A C E ;;====================================================================== (define (common:get-disk-space-used fpath) (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read)) (define (common:check-space-in-dir dirpath required) (let* ((dbspace (if (directory? dirpath) (get-df dirpath) 0))) (list (> dbspace required) dbspace required dirpath))) ;; check space in dbdir and in megatest dir ;; returns: ok/not dbspace required-space ;; (define (common:check-db-dir-space) (let* ((required (string->number (or (configf:lookup *configdat* "setup" "dbdir-space-required") "100000"))) (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir)) (tdbspace (common:check-space-in-dir dbdir required)) (mdbspace (common:check-space-in-dir *toppath* required))) (sort (list tdbspace mdbspace) (lambda (a b) (< (cadr a)(cadr b)))))) ;; check available space in dbdir, exit if insufficient ;; (define (common:check-db-dir-and-exit-if-insufficient) (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now (is-ok (car spacedat)) (dbspace (cadr spacedat)) (required (caddr spacedat)) (dbdir (cadddr spacedat))) (if (not is-ok) (begin (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.") (exit 1))))) ;; paths is list of lists ((name path) ... ) ;; (define (common:get-disk-with-most-free-space disks minsize) (let ((best #f) (bestsize 0)) (for-each (lambda (disk-num) (let* ((dirpath (cadr (assoc disk-num disks))) (freespc (cond ((not (directory? dirpath)) (if (common:low-noise-print 300 "disks not a dir " disk-num) (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) -1) ((not (file-write-access? dirpath)) (if (common:low-noise-print 300 "disks not writeable " disk-num) (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) -1) ((not (eq? (string-ref dirpath 0) #\/)) (if (common:low-noise-print 300 "disks not a proper path " disk-num) (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) -1) (else (get-df dirpath))))) (if (> freespc bestsize) (begin (set! best (cons disk-num dirpath)) (set! bestsize freespc))))) (map car disks)) (if (and best (> bestsize minsize)) best #f))) ;; #f means no disk candidate found ;; see defstruct host at top of file. ;; host: reachable last-update last-used last-cpuload ;; (define (common:update-host-loads-table hosts-raw) (let* ((hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw))) (for-each (lambda (hostname) (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) (if h h (let ((h (make-host))) (hash-table-set! *host-loads* hostname h) h)))) (host-info (common:get-host-info hostname)) (is-reachable (car host-info)) (last-reached-time (cadr host-info)) (load (caddr host-info))) (host-reachable-set! rec is-reachable) (host-last-update-set! rec last-reached-time) (host-last-cpuload-set! rec load))) hosts))) ;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the ;; [host-rules] section. ;; (define (common:get-least-loaded-host hosts-raw host-type configdat) (let* ((rdat (configf:lookup configdat "host-rules" host-type)) (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second (hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw)) ;; (best-host #f) (get-rec (lambda (hostname) ;; (print "get-rec hostname=" hostname) (let ((h (hash-table-ref/default *host-loads* hostname #f))) (if h h (let ((h (make-host))) (hash-table-set! *host-loads* hostname h) h))))) (best-load 99999) (curr-time (current-seconds)) (get-hosts-sorted (lambda (hosts) (sort hosts (lambda (a b) (let ((a-rec (get-rec a)) (b-rec (get-rec b))) ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) (< (host-last-used a-rec) (host-last-used b-rec)))))))) (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) (if (null? hosts) #f ;; no hosts to select from. All done and giving up now. (let ((hosts-sorted (get-hosts-sorted hosts))) (common:update-host-loads-table hosts) (let loop ((hostname (car hosts-sorted)) (tal (cdr hosts-sorted)) (best-host #f)) (let* ((rec (get-rec hostname)) (reachable (host-reachable rec)) (load (host-last-cpuload rec)) (last-used (host-last-used rec)) (delta (- curr-time last-used)) (job-rate (if (> delta 0) (/ 1 delta) 999)) ;; jobs per second (new-best (cond ((not reachable) (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") best-host) ((and (< load maxnload) ;; load is acceptable (< job-rate maxjobrate)) ;; job rate is acceptable (set! best-load load) hostname) (else best-host)))) (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) (if new-best (begin ;; found a host, return it (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (< 1 numcpus-in) ;; not possible (common:get-num-cpus remote-host) numcpus-in)) (maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (+ (random 10)(/ (- 1000 count) 10) waitdelay))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f)) (numcpus (common:get-num-cpus hh))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) (if (eof-object? inl) (begin (common:write-cached-info remote-host "num-cpus" numcpu) numcpu) (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) (+ numcpu 1) numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) (common:write-cached-info actual-host "num-cpus" result) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxload msg remote-host) (let ((num-cpus (common:get-num-cpus remote-host))) (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) ) |