Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -408,13 +408,22 @@ (define (common:version-changed?) (not (equal? (common:get-last-run-version) (common:version-signature)))) + +;; From 1.70 to 1.80, db's are compatible. + (define (common:api-changed?) - (not (equal? (substring (->string megatest-version) 0 4) - (substring (conc (common:get-last-run-version)) 0 4)))) + (let* ( + (megatest-major-version (substring (->string megatest-version) 0 4)) + (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) + ) + (and (not (equal? megatest-major-version "1.80")) + (not (equal? megatest-major-version megatest-run-version))) + ) +) ;;====================================================================== ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; @@ -599,14 +608,15 @@ ;;====================================================================== ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified ;; Do NOT check if not on homehost! ;; (define (common:exit-on-version-changed) - (if (common:on-homehost?) + (if (and *toppath* ;; do nothing if *toppath* not yet provided + (common:on-homehost?)) (if (common:api-changed?) (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".megatest/main.db")) (read-only (not (file-write-access? dbfile))) (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) (debug:print 0 *default-log-port* "WARNING: Version mismatch!\n" " expected: " (common:version-signature) "\n" @@ -626,14 +636,14 @@ (common:cleanup-db dbstruct))) ((not (common:file-exists? mtconf)) (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") + (debug:print 0 *default-log-port* " .megatest/main.db does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") + (debug:print 0 *default-log-port* " You do not own .megatest/main.db in this area. Cannot proceed with megatest version migration.") (exit 1)) (read-only (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") (exit 1)) (else @@ -980,12 +990,13 @@ ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== (define (common:run-sync?) - (and (common:on-homehost?) - (args:get-arg "-server"))) + (and *toppath* ;; gate if called before *toppath* is set + (common:on-homehost?) + (args:get-arg "-server"))) (define (common:human-time) (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S")) @@ -1989,10 +2000,17 @@ (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) + (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... + (if (not *toppath*) + (begin + (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") + (thread-sleep! 30) + (if (< (- (current-seconds) start-time) 300) + (loop start-time))))) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (server:choose-server *toppath* 'homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -328,11 +328,11 @@ (current-error-port) (lambda () (apply print params)))) (define (dbfile:cautious-open-database fname init-proc sync-mode journal-mode #!optional (tries-left 500)) - (let* ((busy-file (conc fname"-journal")) + (let* ((busy-file (conc fname "-journal")) (delay-time (* (- 51 tries-left) 1.1)) (write-access (file-write-access? fname)) (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) @@ -513,11 +513,12 @@ (db:sync-touched dbstruct runid keys dbinit) (set! *db-sync-in-progress* #f) (delete-file* lock-file) #t) (begin - (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.") + (if (common:low-noise-print 120 (conc "no lock "from-db-file)) + (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.")) #f )))) ;; ;; Get a lock from the no-sync-db for the from-db, then delta sync the from-db to the to-db, otherwise return #f ;; ;; Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.7009) +(define megatest-version 1.8001) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -190,46 +190,48 @@ (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) (bad-dat (list #f #f #f #f #f))) (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) - bad-dat) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (begin + exn + (begin + ;; WARNING: this is potentially dangerous to blanket ignore the errors + (if (file-exists? logf) + (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn)) + bad-dat) ;; no idea what went wrong, call it a bad server + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match server-rx inl)) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) bad-dat)) - (match mlst - ((_ host port start server-id pid) - (list host - (string->number port) - (string->number start) - server-id - (string->number pid))) - (else - (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) - bad-dat)))) - (begin - (if dbprep-found - (begin - (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) - bad-dat)))))))) + (match mlst + ((_ host port start server-id pid) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid))) + (else + (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat)))) + (begin + (if dbprep-found + (begin + (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) + (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) + bad-dat)))))))) ;; ;; get a list of servers from the log files, with all relevant data ;; ;; ( mod-time host port start-time pid ) ;; ;; ;; (define (server:get-list areapath #!key (limit #f)) @@ -419,10 +421,11 @@ ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) + (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") (let* ((servinfodir (conc *toppath*"/.servinfo"))) (if (not (file-exists? servinfodir)) (create-directory servinfodir)) (let* ((allfiles (glob (conc servinfodir"/*"))) (res (make-hash-table))) @@ -468,13 +471,13 @@ (oldest-dat (hash-table-ref serversdat oldest)) (host (list-ref oldest-dat 0)) (all-valid (filter (lambda (x) (equal? host (list-ref (hash-table-ref serversdat x) 0))) by-time-asc)) - (best-five (lambda () - (if (> (length all-valid) 5) - (take all-valid 5) + (best-ten (lambda () + (if (> (length all-valid) 10) + (take all-valid 10) all-valid))) (names->dats (lambda (names) (map (lambda (x) (hash-table-ref serversdat x)) names))) @@ -488,15 +491,15 @@ (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) (print "youngest: "(hash-table-ref serversdat (car all-valid)))) ((home) host) ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) - ((best-five)(names->dats (best-five))) + ((best-ten)(names->dats (best-ten))) ((all-valid)(names->dats all-valid)) - ((best) (let* ((best-five (best-five)) - (len (length best-five))) - (hash-table-ref serversdat (list-ref best-five (random len))))) + ((best) (let* ((best-ten (best-ten)) + (len (length best-ten))) + (hash-table-ref serversdat (list-ref best-ten (random len))))) ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin @@ -516,11 +519,11 @@ ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least seconds old ;; (server:wait-for-server-start-last-flag areapath) - (if (< (server:choose-server areapath 'count) 10) + (if (< (server:choose-server areapath 'count) 20) (server:run areapath)) #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) @@ -555,11 +558,11 @@ ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath)))) + (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) (not servers)) ;; (and (list? servers) ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers