Overview
Comment: | Speculative fixes/defense against couple crashes |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 | v1.6546 |
Files: | files | file ages | folders |
SHA1: |
214ac177738fc29784d3b26ae9035502 |
User & Date: | mrwellan on 2020-07-24 17:06:00 |
Other Links: | branch diff | manifest | tags |
Context
2020-07-24
| ||
17:06 | Speculative fixes/defense against couple crashes check-in: 11a1545766 user: mrwellan tags: v1.65, v1.6546 | |
17:06 | Speculative fixes/defense against couple crashes check-in: 214ac17773 user: mrwellan tags: v1.65, v1.6546 | |
2020-07-23
| ||
00:01 | changed mesage output from check for process to go to stderr and use debug:print check-in: 2278ceb10b user: matt tags: v1.65 | |
Changes
Modified common.scm from [c85ab27a6e] to [5ca9ccb479].
︙ | ︙ | |||
690 691 692 693 694 695 696 697 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) (if (common:file-exists? fname) | > > > > | | | | 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) (let ((fmod-time (handle-exceptions ext (current-seconds) (file-modification-time fname)))) (if (common:file-exists? fname) (if (> (- (current-seconds) fmod-time) expire-time) (begin (handle-exceptions exn #f (delete-file* fname)) (common:simple-file-lock fname expire-time: expire-time)) #f) (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () (print key-string))) (thread-sleep! 0.25) (if (common:file-exists? fname) (handle-exceptions exn #f (with-input-from-file fname (lambda () (equal? key-string (read-line))))) #f))))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) |
︙ | ︙ | |||
1724 1725 1726 1727 1728 1729 1730 | (file-read-access? fullpath)) (handle-exceptions exn #f (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) | > > > > > > | | 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 | (file-read-access? fullpath)) (handle-exceptions exn #f (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (handle-exceptions exn (begin (debug:print-info 1 *default-log-port* " removing bad file " fullpath) (delete-file* fullpath) #f) (with-input-from-file fullpath read)) (begin (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) |
︙ | ︙ | |||
2018 2019 2020 2021 2022 2023 2024 | (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload maxload-in | > > | | 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 | (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload maxload-in (if (number? maxload-in) (max maxload-in 0.5) 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously ;; let's let the user know once in a long while that load checking is happening but not constantly report it (if (> (random 100) 75) ;; about 25% of the time |
︙ | ︙ |
Modified rmt.scm from [63fb23d391] to [e52432dc0d].
︙ | ︙ | |||
251 252 253 254 255 256 257 | ;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") ;; (mutex-lock! *rmt-mutex*) (let* ((conninfo (remote-conndat runremote)) | | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 | ;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") ;; (mutex-lock! *rmt-mutex*) (let* ((conninfo (remote-conndat runremote)) (dat-in (case (remote-transport runremote) ((http) (condition-case ;; handling here has ;; caused a lot of ;; problems. However it ;; is needed to deal with ;; attemtped ;; communication to ;; servers that have gone ;; away (http-transport:client-api-send-receive 0 conninfo cmd params) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) ;; No Title ;; Error: (vector-ref) out of range ;; #(#<condition: (exn type)> (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299))) ;; 6 ;; ;; Call history: ;; ;; http-transport.scm:306: thread-terminate! ;; http-transport.scm:307: debug:print-info ;; common_records.scm:235: debug:debug-mode ;; rmt.scm:259: k587 ;; rmt.scm:259: g591 ;; rmt.scm:276: http-transport:server-dat-update-last-access ;; http-transport.scm:364: current-seconds ;; rmt.scm:282: debug:print-info ;; common_records.scm:235: debug:debug-mode ;; rmt.scm:283: mutex-unlock! ;; rmt.scm:287: extras-transport-succeded <-- ;; +-----------------------------------------------------------------------------+ ;; | Exit Status : 70 ;; (dat (if (and (vector? dat-in) ;; ... check it is a correct size (> (vector-length dat-in) 1)) dat-in (vector #f (conc "communications fail (type 2), dat-in=" dat-in)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (and (vector? conninfo) (< 5 (vector-length conninfo))) (http-transport:server-dat-update-last-access conninfo) ;; refresh access time (begin (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) (set! conninfo #f) |
︙ | ︙ |