Overview
Comment: | Avoided server logs that have exited. Detected when a server is in dbprep, and wait 25 seconds for that to finish. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-real |
Files: | files | file ages | folders |
SHA1: |
bbf1632194b88c6bb401c73017fa79b2 |
User & Date: | mmgraham on 2021-02-12 12:23:13 |
Other Links: | branch diff | manifest | tags |
Context
2021-02-12
| ||
16:44 | Eliminated load delays of less than 1 second. Eliminated INFO when load is acceptable. check-in: bd581f5fff user: mmgraham tags: v1.65-real | |
12:23 | Avoided server logs that have exited. Detected when a server is in dbprep, and wait 25 seconds for that to finish. check-in: bbf1632194 user: mmgraham tags: v1.65-real | |
2021-02-05
| ||
15:36 | merged in changes for wildcard target in remove runs, support for dashboard in sles12, and archival of rerun data. check-in: 6ff4310a7a user: mmgraham tags: v1.65-real, v1.6583 | |
Changes
Modified common.scm from [bf0a0a25ad] to [28bc361690].
︙ | ︙ | |||
2190 2191 2192 2193 2194 2195 2196 | (normalized-effective-load (/ effective-load numcpus)) (will-wait (> normalized-effective-load maxnormload))) (if (> recommended-delay 0) (let* ((actual-delay (min recommended-delay 30))) (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load. current normalized effective load is " | | | 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 | (normalized-effective-load (/ effective-load numcpus)) (will-wait (> normalized-effective-load maxnormload))) (if (> recommended-delay 0) (let* ((actual-delay (min recommended-delay 30))) (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load. current normalized effective load is " normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) |
︙ | ︙ |
Modified server.scm from [e1c4f3709b] to [9bfbdb6820].
︙ | ︙ | |||
165 166 167 168 169 170 171 | (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (define (server:logf-get-start-info logf) | | > > | > > > > > > > > > > | > | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 | (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (define (server:logf-get-start-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0)) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) (let ((mlst (string-match server-rx inl)) (dbprep (string-match dbprep-rx inl)) ) (if dbprep (set! dbprep-found 1) ) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) (list #f #f #f #f))) (let ((dat (cdr mlst))) (list (car dat) ;; host (string->number (cadr dat)) ;; port (string->number (caddr dat)) (cadr (cddr dat)))))) (begin (if dbprep-found (begin (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds)) (thread-sleep! 25) ) (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds)) ) (list #f #f #f #f))))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) |
︙ | ︙ | |||
212 213 214 215 216 217 218 | (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) | | > > > | > > | > | | 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited. (let* ((server-logs-cmd (conc "grep -iL exiting " areapath "/logs/server-*-*.log")) (server-logs (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all)))) (num-serv-logs (length server-logs))) (if (or (null? server-logs) (= num-serv-logs 0)) (let () (debug:print 1 *default-log-port* "There are no servers running") '() ) (let loop ((hed (string-chomp (car server-logs))) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (begin (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 |
︙ | ︙ | |||
241 242 243 244 245 246 247 | res (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) | | | 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | res (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) (handle-exceptions exn |
︙ | ︙ | |||
357 358 359 360 361 362 363 364 365 366 367 368 369 370 | (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if (and all-go (begin (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) (let ((res (with-input-from-file start-flag (lambda () (read-line))))) | > | 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 | (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if (and all-go (begin (debug:print-info 0 *default-log-port* "Writing " start-flag) (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) (let ((res (with-input-from-file start-flag (lambda () (read-line))))) |
︙ | ︙ | |||
393 394 395 396 397 398 399 400 401 402 403 404 405 406 | ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 15) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) ;; this one seems to be the general entry point | > | 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 15) (debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) ;; this one seems to be the general entry point |
︙ | ︙ |
Modified tests.scm from [0094b671e6] to [58a365a2ab].
︙ | ︙ | |||
64 65 66 67 68 69 70 | (if section (map cadr section) '())))) (filter (lambda (d) (if (directory-exists? d) d (begin | | | | 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | (if section (map cadr section) '())))) (filter (lambda (d) (if (directory-exists? d) d (begin ;; (if (common:low-noise-print 60 "tests:get-tests-search-path" d) ;; (debug:print 0 *default-log-port* "WARNING: problem with directory " d ", dropping it from tests path")) #f))) (append paths (list (conc *toppath* "/tests")))))) (define (tests:get-valid-tests test-registry tests-paths) (if (null? tests-paths) test-registry (let loop ((hed (car tests-paths)) |
︙ | ︙ |