Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -408,13 +408,22 @@ (define (common:version-changed?) (not (equal? (common:get-last-run-version) (common:version-signature)))) + +;; From 1.70 to 1.80, db's are compatible. + (define (common:api-changed?) - (not (equal? (substring (->string megatest-version) 0 4) - (substring (conc (common:get-last-run-version)) 0 4)))) + (let* ( + (megatest-major-version (substring (->string megatest-version) 0 4)) + (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) + ) + (and (not (equal? megatest-major-version "1.80")) + (not (equal? megatest-major-version megatest-run-version))) + ) +) ;;====================================================================== ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; @@ -520,11 +529,11 @@ (define (common:rotate-logs) (let* ((all-files (make-hash-table)) (stats (make-hash-table)) (inc-stat (lambda (key) (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) - (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age + (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn @@ -599,14 +608,15 @@ ;;====================================================================== ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified ;; Do NOT check if not on homehost! ;; (define (common:exit-on-version-changed) - (if (common:on-homehost?) + (if (and *toppath* ;; do nothing if *toppath* not yet provided + (common:on-homehost?)) (if (common:api-changed?) (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".megatest/main.db")) (read-only (not (file-write-access? dbfile))) (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) (debug:print 0 *default-log-port* "WARNING: Version mismatch!\n" " expected: " (common:version-signature) "\n" @@ -626,14 +636,14 @@ (common:cleanup-db dbstruct))) ((not (common:file-exists? mtconf)) (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") + (debug:print 0 *default-log-port* " .megatest/main.db does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") + (debug:print 0 *default-log-port* " You do not own .megatest/main.db in this area. Cannot proceed with megatest version migration.") (exit 1)) (read-only (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") (exit 1)) (else @@ -710,19 +720,21 @@ "")))) (define (common:alist-ref/default key alist default) (or (alist-ref key alist) default)) -(define (common:low-noise-print waitval . keys) - (let* ((key (string-intersperse (map conc keys) "-" )) - (lasttime (hash-table-ref/default *common:denoise* key 0)) - (currtime (current-seconds))) - (if (> (- currtime lasttime) waitval) - (begin - (hash-table-set! *common:denoise* key currtime) - #t) - #f))) +;; moved into commonmod +;; +;; (define (common:low-noise-print waitval . keys) +;; (let* ((key (string-intersperse (map conc keys) "-" )) +;; (lasttime (hash-table-ref/default *common:denoise* key 0)) +;; (currtime (current-seconds))) +;; (if (> (- currtime lasttime) waitval) +;; (begin +;; (hash-table-set! *common:denoise* key currtime) +;; #t) +;; #f))) (define (common:get-megatest-exe) (or (getenv "MT_MEGATEST") "megatest")) (define (common:read-encoded-string instr) @@ -980,12 +992,13 @@ ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== (define (common:run-sync?) - (and (common:on-homehost?) - (args:get-arg "-server"))) + (and *toppath* ;; gate if called before *toppath* is set + (common:on-homehost?) + (args:get-arg "-server"))) (define (common:human-time) (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S")) @@ -1989,10 +2002,17 @@ (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) + (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... + (if (not *toppath*) + (begin + (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") + (thread-sleep! 30) + (if (< (- (current-seconds) start-time) 300) + (loop start-time))))) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (server:choose-server *toppath* 'homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -359,13 +359,15 @@ (define (db:cache-for-read-only source target #!key (use-last-update #f)) (if (and (hash-table-ref/default *global-db-store* target #f) (>= (file-modification-time target)(file-modification-time source))) (hash-table-ref *global-db-store* target) (let* ((toppath (launch:setup)) - (targ-db-last-mod (if (common:file-exists? target) - (file-modification-time target) - 0)) + (targ-db-last-mod (db:get-sqlite3-mod-time target)) +;; (if (common:file-exists? target) +;; BUG: This needs to include wal mode stuff .shm etc. +;; (file-modification-time target) +;; 0)) (cache-db (or (hash-table-ref/default *global-db-store* target #f) (db:open-megatest-db path: target))) (source-db (db:open-megatest-db path: source)) (curr-time (current-seconds)) (res '()) @@ -405,35 +407,54 @@ ;; use-last-update: #t))) ;; (thread-start! th1) ;; (apply proc cache-db params) ;; )))) - - - +(define (db:get-sqlite3-mod-time fname) + (let* ((wal-file (conc fname "-wal")) + (shm-file (conc fname "-shm")) + (get-mtime (lambda (f) + (if (and (file-exists? f) + (file-read-access? f)) + (file-modification-time f) + 0)))) + (max (get-mtime fname) + (get-mtime wal-file) + (get-mtime shm-file)))) + (define (db:all-db-sync dbstruct) (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db)) (data-synced 0) ;; count of changed records (tmp-area (common:get-db-tmp-area)) (dbfiles (glob (conc tmp-area"/.megatest/*.db"))) (sync-durations (make-hash-table)) (no-sync-db (db:open-no-sync-db))) (for-each - (lambda (file) + (lambda (file) ;; tmp db file (debug:print-info 3 *default-log-port* "file: " file) - (let* ((fname (conc (pathname-file file) ".db")) - (fulln (conc *toppath*"/.megatest/"fname)) - (time1 (if (file-exists? file) - (file-modification-time file) - (begin - (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) - 1))) - (time2 (if (file-exists? fulln) - (file-modification-time fulln) - (begin - (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) - 0))) + (let* ((fname (conc (pathname-file file) ".db")) ;; fname is tmp db file + (wal-file (conc fname "-wal")) + (shm-file (conc fname "-shm")) + (fulln (conc *toppath*"/.megatest/"fname)) ;; fulln is nfs db name + (wal-time (if (file-exists? wal-file) + (file-modification-time wal-file) + 0)) + (shm-time (if (file-exists? shm-file) + (file-modification-time shm-file) + 0)) + (time1 (db:get-sqlite3-mod-time file)) +;; (if (file-exists? file) ;; time1 is the max itime of the tmp db, -wal and -shm files. +;; (max (file-modification-time file) wal-time shm-time) +;; (begin +;; (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) +;; 1))) + (time2 (db:get-sqlite3-mod-time fulln)) +;; (if (file-exists? fulln) ;; time2 is nfs file time +;; (file-modification-time fulln) +;; (begin +;; (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) +;; 0))) (changed (> (- time1 time2) (+ (random 5) 1))) ;; it has been at some few seconds since last synced (changed10 (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy? (do-cp (cond ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover @@ -484,12 +505,11 @@ (dest-area (if old2new tmp-area *toppath*)) (dbfiles (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db")))) (keys (db:get-keys dbstruct)) (sync-durations (make-hash-table))) - - (if killservers + (if (and killservers servers) (begin (for-each (lambda (server) (handle-exceptions exn @@ -501,10 +521,13 @@ (tasks:kill-server host pid))))) servers) (delete-file* (common:get-sync-lock-filepath)) ) ) + + (if (not dbfiles) + (debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.megatest")) (for-each (lambda (srcfile) (debug:print-info 3 *default-log-port* "file: " srcfile) (let* ((fname (conc (pathname-file srcfile) ".db")) (basename (pathname-file srcfile)) @@ -525,10 +548,11 @@ (changed ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds (do-cp (cond ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile) + ;; TODO: Need to fix this for WAL mod. Can't just copy. (system (conc "/bin/mkdir -p " dest-directory)) (system (conc "/bin/cp " srcfile " " destfile)) #t) (changed ;; (and changed ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed. @@ -562,14 +586,17 @@ (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date") ) ) ) dbfiles + ) ) data-synced ) ) + + ;; Sync all changed db's ;; (define (db:tmp->megatest.db-sync dbstruct run-id last-update) (let* ((subdbs (hash-table-values (dbr:dbstruct-subdbs dbstruct))) @@ -2306,12 +2333,12 @@ (sqlite3:for-each-row (lambda (run-id testname item-path state status) ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) db - "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=?;" - test-id))) + "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" + test-id run-id))) res)) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -328,19 +328,19 @@ (current-error-port) (lambda () (apply print params)))) (define (dbfile:cautious-open-database fname init-proc sync-mode journal-mode #!optional (tries-left 500)) - (let* ((busy-file (conc fname"-journal")) + (let* ((busy-file (conc fname "-journal")) (delay-time (* (- 51 tries-left) 1.1)) (write-access (file-write-access? fname)) (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc - sync-mode: sync-mode journal-mode: journal-mode + sync-mode: sync-mode journal-mode (- tries-left 1)))))) (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) (if (and (file-write-access? fname) (file-exists? busy-file)) @@ -351,11 +351,11 @@ (thread-sleep! 1) (if (eq? tries-left 2) (begin (dbfile:print-err "INFO: forcing journal rollup "busy-file) (dbfile:brute-force-salvage-db fname))) - (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1))) + (dbfile:cautious-open-database fname init-proc sync-mode journal-mode (- tries-left 1))) (let* ((result (condition-case (if dir-access (dbfile:with-simple-file-lock (conc fname ".lock") @@ -513,11 +513,12 @@ (db:sync-touched dbstruct runid keys dbinit) (set! *db-sync-in-progress* #f) (delete-file* lock-file) #t) (begin - (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.") + (if (common:low-noise-print 120 (conc "no lock "from-db-file)) + (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.")) #f )))) ;; ;; Get a lock from the no-sync-db for the from-db, then delta sync the from-db to the to-db, otherwise return #f ;; ;; @@ -992,10 +993,11 @@ ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;") ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no ;; (define (db:with-db dbstruct run-id r/w proc . params) + (assert dbstruct "FATAL: db:with-db called with dbstruct "#f) (let* ((use-mutex (> *api-process-request-count* 25)) ;; risk of db corruption (have-struct (dbr:dbstruct? dbstruct)) (dbdat (if have-struct ;; this stuff just allows us to call with a db handle directly (db:open-db dbstruct run-id (dbfile:db-init-proc)) ;; (dbfile:get-subdb dbstruct run-id) #f)) @@ -1008,10 +1010,11 @@ (jfile (conc fname"-journal")) #;(subdb (if have-struct (dbfile:get-subdb dbstruct run-id) #f)) ) ;; was 25 + (assert (sqlite3:database? db) "FATAL: db:with-db, db is not a database, db="db", fname="fname) (if (file-exists? jfile) (begin (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load") (thread-sleep! 0.2))) (if (and use-mutex Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -27,10 +27,13 @@ (declare (unit dcommon)) (declare (uses gutils)) (declare (uses db)) +(declare (uses commonmod)) + +(import commonmod) ;; (declare (uses synchash)) (include "megatest-version.scm") (include "common_records.scm") (include "db_records.scm") @@ -635,11 +638,12 @@ (common:max (map cadr col-indices)))) (max-visible (max (- (dboard:tabdat-num-tests tabdat) 15) 3)) (max-col-vis (if (> max-col 10) 10 max-col)) (numrows 1) (numcols 1)) - (iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS") + (if (common:low-noise-print 60 "runs-stats-update-clear") + (iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS")) (iup:attribute-set! stats-matrix "NUMCOL" max-col ) (iup:attribute-set! stats-matrix "NUMLIN" (if (< max-row max-visible) max-visible max-row)) ;; min of 20 (iup:attribute-set! stats-matrix "NUMCOL_VISIBLE" max-col-vis) (iup:attribute-set! stats-matrix "NUMLIN_VISIBLE" (if (> max-row max-visible) max-visible max-row)) ;;(print "row-indices: " row-indices " col-indices: " col-indices) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -153,11 +153,11 @@ (determine-proxy (constantly #f))) (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions exn (begin - (print-error-message exn) + ;; (print-error-message exn) (if (< portnum 64000) (begin (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* "exn=" (condition->list exn)) @@ -359,10 +359,11 @@ (begin (print-call-chain *default-log-port*) (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (close-connection! api-dat) (close-idle-connections!) + (remote-conndat-set! runremote #f) #t)) #f))) (define (make-http-transport:server-dat)(make-vector 6)) @@ -393,11 +394,11 @@ ;; ;; connect ;; (define (http-transport:client-connect iface port server-id) - (debug:print-info 0 *default-log-port* "Connecting to client at "iface":"port", with server-id "server-id) + (debug:print-info 2 *default-log-port* "Connecting to server at "iface":"port", id "server-id) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) server-dat)) @@ -429,11 +430,11 @@ (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) - (let* ((servinfodir (conc *toppath*"/.servinfo")) + (let* ((servinfodir (server:get-servinfo-dir *toppath*)) ;; (conc *toppath*"/.servinfo")) (ipaddr (car sdat)) (port (cadr sdat)) (servinf (conc servinfodir"/"ipaddr":"port))) (set! servinfofile servinf) (if (not (file-exists? servinfodir)) @@ -448,34 +449,23 @@ (lambda () (delete-file* servinf)) *on-exit-procs*)) ;; put data about this server into a simple flat file host.port (debug:print-info 0 *default-log-port* "Received server alive signature") - #;(common:save-pkt `((action . alive) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat))) - *configdat* #t) sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (let* ((ipaddr (car sdat)) + (if sdat + (let* ((ipaddr (car sdat)) (port (cadr sdat)) - (servinf (conc *toppath*"/.servinfo/"ipaddr":"port))) - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - ;; (delete-file* servinf) ;; handled by on-exit, can be removed - #;(common:save-pkt `((action . died) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat)) - (msg . "Transport died?")) - *configdat* #t) + (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port))) + (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") (exit)) + (exit) + ) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) @@ -504,11 +494,12 @@ (if (and no-sync-db (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin (if (common:low-noise-print 120 "sync-all-print") (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))) - (db:all-db-sync *dbstruct-dbs*)))) + (db:all-db-sync *dbstruct-dbs*) + ))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) @@ -560,22 +551,22 @@ (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) - (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter + (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter (not *server-overloaded*) (file-exists? servinfofile)) (change-file-times servinfofile curr-time curr-time))) - (if (or (common:low-noise-print 120 "start new server") + (if (and (common:low-noise-print 120 "start new server") (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another (begin - (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...") + (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...") (server:kind-run *toppath*) (if (> *api-process-request-count* 100) (begin - (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) + (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) (delete-file* servinfofile))))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) @@ -621,59 +612,36 @@ ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch) - ;; check that a server start is in progress, pause or exit if so - (let* ((tmp-area (common:get-db-tmp-area)) - (server-start (conc tmp-area "/.server-start")) - (server-started (conc tmp-area "/.server-started")) - (start-time (common:lazy-modification-time server-start)) - (started-time (common:lazy-modification-time server-started)) - (server-starting (< start-time started-time)) ;; if start-time is less than started-time then a server is still starting - (start-time-old (> (- (current-seconds) start-time) 5)) - (cleanup-proc (lambda (msg) - (let* ((serv-fname (conc "server-" (current-process-id) "-" (get-host-name) ".log")) - (full-serv-fname (conc *toppath* "/logs/" serv-fname)) - (new-serv-fname (conc *toppath* "/logs/" "defunct-" serv-fname))) - (debug:print 0 *default-log-port* msg) - (if (common:file-exists? full-serv-fname) - (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname)) - (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname)) - (exit))))) - #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago - (not server-starting)) - (begin - (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting") - (exit))) - ;; lets not even bother to start if there are already three or more server files ready to go - #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) - (if (> num-alive 3) - (begin - (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) - (exit)))) - #;(common:save-pkt `((action . start) - (T . server) - (pid . ,(current-process-id))) - *configdat* #t) - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - )) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server monitor thread started") - (http-transport:keep-running) - "Keep running")))) - (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit)))) + ;; check the .servinfo directory, are there other servers running on this + ;; or another host? + (let* ((server-start-is-ok (server:minimal-check *toppath*))) + (if (not server-start-is-ok) + (begin + (debug:print 0 *default-log-port* "ERROR: server start not ok, exiting now.") + (exit 1)))) + + ;; check that a server start is in progress, pause or exit if so + (let* ((th2 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server run thread started") + (http-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + )) "Server run")) + (th3 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server monitor thread started") + (http-transport:keep-running) + "Keep running")))) + (thread-start! th2) + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. + (thread-start! th3) + (set! *didsomething* #t) + (thread-join! th2) + (exit))) ;; (define (http-transport:server-signal-handler signum) ;; (signal-mask! signum) ;; (handle-exceptions ;; exn Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.7009) +(define megatest-version 1.8005) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -654,23 +654,10 @@ ;; for some switches always print the command to stderr ;; (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) -;; some switches imply homehost. Exit here if not on homehost -;; -(let ((homehost-required (list "-cleanup-db"))) - (if (apply args:any? homehost-required) - (if (not (server:choose-server *toppath* 'home?)) - (for-each - (lambda (switch) - (if (args:get-arg switch) - (begin - (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch - ", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.") - (exit 1)))) - homehost-required)))) ;;====================================================================== ;; Misc setup stuff ;;====================================================================== @@ -934,13 +921,13 @@ ;;====================================================================== ;; Server? Start up here. ;; (if (args:get-arg "-server") - (let ((tl (launch:setup)) - (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) - (server:launch 0 transport-type) + (let ((tl (launch:setup))) + ;; (server:launch 0 'http) + (http-transport:launch) (set! *didsomething* #t))) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; @@ -2321,10 +2308,14 @@ (begin (if (not (launch:setup)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) + (if (not (server:choose-server *toppath* 'home?)) + (begin + (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db") + (exit 1))) (let ((dbstructs (db:setup #f))) (common:cleanup-db dbstructs)) (set! *didsomething* #t))) (if (args:get-arg "-mark-incompletes") ADDED mtargs/mtargs.egg Index: mtargs/mtargs.egg ================================================================== --- /dev/null +++ mtargs/mtargs.egg @@ -0,0 +1,7 @@ +((license "LGPL") + (version 0.1) + (category misc) + (dependencies srfi-69 srfi-1) + (author "Matt Welland") + (synopsis "Primitive argument processor.") + (components (extension mtargs))) Index: mtargs/mtargs.scm ================================================================== --- mtargs/mtargs.scm +++ mtargs/mtargs.scm @@ -19,22 +19,28 @@ (module mtargs ( arg-hash get-arg get-arg-from - usage get-args + usage print-args any-defined? - help ) -(import scheme chicken data-structures extras posix ports files) -(use srfi-69 srfi-1) +(import scheme) ;; gives us cond-expand in chicken-4 + +(cond-expand + (chicken-5 + (import scheme (chicken base) (chicken port) (chicken file) (chicken process-context)) + (import srfi-69 srfi-1)) + (chicken-4 + (import chicken posix srfi-69 srfi-1)) + (else)) +(define usage (make-parameter print)) (define arg-hash (make-hash-table)) -(define help "") (define (get-arg arg . default) (if (null? default) (hash-table-ref/default arg-hash arg #f) (hash-table-ref/default arg-hash arg (car default)))) @@ -48,28 +54,10 @@ (define (get-arg-from ht arg . default) (if (null? default) (hash-table-ref/default ht arg #f) (hash-table-ref/default ht arg (car default)))) -(define (usage . args) - (if (> (length args) 0) - (apply print "ERROR: " args)) - (if (string? help) - (print help) - (print "Usage: " (car (argv)) " ... ")) - (exit 0)) - - ;; one-of args defined -(define (any-defined? . param) - (let ((res #f)) - (for-each - (lambda (arg) - (if (get-arg arg)(set! res #t))) - param) - res)) - -;; args: (define (get-args args params switches arg-hash num-needed) (let* ((numtargs (length args)) (adj-num-needed (if num-needed (+ num-needed 2) #f))) (if (< numtargs (if adj-num-needed adj-num-needed 2)) (if (>= num-needed 1) @@ -94,13 +82,12 @@ (else (if (null? tail)(append remtargs (list arg)) ;; return the non-used args (loop (car tail)(cdr tail)(append remtargs (list arg)))))))) )) -(define (print-args remtargs arg-hash) - (print "ARGS: " remtargs) +(define (print-args arg-hash) (for-each (lambda (arg) (print " " arg " " (hash-table-ref/default arg-hash arg #f))) (hash-table-keys arg-hash))) ) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -64,11 +64,11 @@ exn (begin ;; (release-dot-lock fname) (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) + ;; (debug:print 5 *default-log-port* "exn=" (condition->list exn)) (if (common:file-exists? fname)(delete-file fname)) ;; brutally get rid of it (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -175,12 +175,13 @@ (remote-conndat runremote) (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) (remote-server-timeout runremote)))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (http-transport:close-connections area-dat: runremote) + ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections + ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1775,11 +1775,11 @@ (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) (should-check-jobs (match can-run-more-tests ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) (if (< (- max-concurrent-jobs num-running) 25) (begin - (debug:print-info 0 *default-log-port* + (debug:print-info 2 *default-log-port* "less than 20 jobs headroom, ("max-concurrent-jobs "-"num-running")>20. Forcing prelaunch check.") #t) #f)) (else #f)))) ;; no record yet Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -65,21 +65,10 @@ ;;====================================================================== ;; Call this to start the actual server ;; -;; all routes though here end in exit ... -;; -;; start_server -;; -(define (server:launch run-id transport-type) - (case transport-type - ((http)(http-transport:launch)) - ;;((nmsg)(nmsg-transport:launch run-id)) - ;;((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) - ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;; Get the transport @@ -133,18 +122,11 @@ ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* (;; (curr-host (get-host-name)) - ;; (attempt-in-progress (server:start-attempted? areapath)) - ;; (dot-server-url (server:check-if-running areapath)) - ;; (curr-ip (server:get-best-guess-address curr-host)) - ;; (curr-pid (current-process-id)) - ;; (homehost (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - ;; (target-host (car homehost)) - (testsuite (common:get-testsuite-name)) + (let* ((testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) " -server - ";; (or target-host "-") @@ -190,46 +172,48 @@ (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) (bad-dat (list #f #f #f #f #f))) (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) - bad-dat) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (begin + exn + (begin + ;; WARNING: this is potentially dangerous to blanket ignore the errors + (if (file-exists? logf) + (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) + bad-dat) ;; no idea what went wrong, call it a bad server + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match server-rx inl)) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) bad-dat)) - (match mlst - ((_ host port start server-id pid) - (list host - (string->number port) - (string->number start) - server-id - (string->number pid))) - (else - (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) - bad-dat)))) - (begin - (if dbprep-found - (begin - (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) - bad-dat)))))))) + (match mlst + ((_ host port start server-id pid) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid))) + (else + (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat)))) + (begin + (if dbprep-found + (begin + (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) + (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) + bad-dat)))))))) ;; ;; get a list of servers from the log files, with all relevant data ;; ;; ( mod-time host port start-time pid ) ;; ;; ;; (define (server:get-list areapath #!key (limit #f)) @@ -419,11 +403,12 @@ ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) - (let* ((servinfodir (conc *toppath*"/.servinfo"))) + ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") + (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) (if (not (file-exists? servinfodir)) (create-directory servinfodir)) (let* ((allfiles (glob (conc servinfodir"/*"))) (res (make-hash-table))) (for-each @@ -432,15 +417,45 @@ (serverdat (server:logf-get-start-info f))) (match serverdat ((host port start server-id pid) (if (and host port start server-id pid) (hash-table-set! res hostport serverdat) - (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat))) + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) (else - (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat))))) + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) allfiles) res))) + +;; check the .servinfo directory, are there other servers running on this +;; or another host? +;; +;; returns #t => ok to start another server +;; #f => not ok to start another server +;; +(define (server:minimal-check areapath) + (server:clean-up-old areapath) + (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) + (servrs (glob (conc srvdir"/*"))) + (thishostip (server:get-best-guess-address (get-host-name))) + (thisservrs (glob (conc srvdir"/"thishostip":*"))) + (homehostinf (server:choose-server areapath 'homehost)) + (havehome (car homehostinf)) + (wearehome (cdr homehostinf))) + (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome + ", numservers: "(length thisservrs)) + (cond + ((not havehome) #t) ;; no homehost yet, go for it + ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another + ((and havehome (not wearehome)) #f) ;; we are not the home host + ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running + (else + (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) + #t)))) + + +(define server-last-start 0) + ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; ;; mode: @@ -453,29 +468,46 @@ ;; 1. sort by age ascending and ping until good ;; find alive rand from youngest ;; 1. sort by age descending ;; 2. take five ;; 3. check alive, discard if not and repeat + ;; first we clean up old server files + (server:clean-up-old areapath) + (let* ((since-last (- (current-seconds) server-last-start)) + (server-start-delay 10)) + (if ( < (- (current-seconds) server-last-start) 10 ) + (begin + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") + (thread-sleep! server-start-delay) + ) + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + ) + ) (let* ((serversdat (server:get-servers-info areapath)) (servkeys (hash-table-keys serversdat)) - (by-time-asc (if (not (null? servkeys)) + (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last (sort servkeys ;; list of "host:port" (lambda (a b) (>= (list-ref (hash-table-ref serversdat a) 2) (list-ref (hash-table-ref serversdat b) 2)))) '()))) + (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) + (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) (if (not (null? by-time-asc)) (let* ((oldest (last by-time-asc)) (oldest-dat (hash-table-ref serversdat oldest)) (host (list-ref oldest-dat 0)) (all-valid (filter (lambda (x) (equal? host (list-ref (hash-table-ref serversdat x) 0))) by-time-asc)) - (best-five (lambda () - (if (> (length all-valid) 5) - (take all-valid 5) - all-valid))) + (best-ten (lambda () + (if (> (length all-valid) 11) + (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out + (if (> (length all-valid) 8) + (drop-right all-valid 1) + all-valid)))) (names->dats (lambda (names) (map (lambda (x) (hash-table-ref serversdat x)) names))) (am-home? (lambda () @@ -488,25 +520,54 @@ (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) (print "youngest: "(hash-table-ref serversdat (car all-valid)))) ((home) host) ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) - ((best-five)(names->dats (best-five))) + ((best-ten)(names->dats (best-ten))) ((all-valid)(names->dats all-valid)) - ((best) (let* ((best-five (best-five)) - (len (length best-five))) - (hash-table-ref serversdat (list-ref best-five (random len))))) + ((best) (let* ((best-ten (best-ten)) + (len (length best-ten))) + (hash-table-ref serversdat (list-ref best-ten (random len))))) ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) - (thread-sleep! 3) + (set! server-last-start (current-seconds)) + ;; (thread-sleep! 3) (case mode ((homehost) (cons #f #f)) (else #f)))))) + +(define (server:get-servinfo-dir areapath) + (let* ((spath (conc areapath"/.servinfo"))) + (if (not (file-exists? spath)) + (create-directory spath #t)) + spath)) + +(define (server:clean-up-old areapath) + ;; any server file that has not been touched in ten minutes is effectively dead + (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) + (for-each + (lambda (sfile) + (let* ((modtime (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) + (current-seconds)) + (file-modification-time sfile)))) + (if (and (number? modtime) + (> (- (current-seconds) modtime) + 600)) + (begin + (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) + (delete-file sfile)))))) + sfiles))) ;; would like to eventually get rid of this ;; (define (common:on-homehost?) (server:choose-server *toppath* 'home?)) @@ -516,11 +577,17 @@ ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least seconds old ;; (server:wait-for-server-start-last-flag areapath) - (if (< (server:choose-server areapath 'count) 10) + (let loop () + (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) + (begin + (if (common:low-noise-print 30 "our-host-load") + (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) + (loop)))) + (if (< (server:choose-server areapath 'count) 20) (server:run areapath)) #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) @@ -538,11 +605,12 @@ (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) - (let ((num-ok (length (server:choose-server areapath 'all-valid)))) + (let* ( (servers (server:choose-server areapath 'all-valid)) + (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath) @@ -555,11 +623,11 @@ ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath)))) + (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) (not servers)) ;; (and (list? servers) ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers @@ -668,11 +736,11 @@ (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) - 60))) + 600))) (define (server:get-best-guess-address hostname) (let ((res #f)) (for-each (lambda (adr)