Overview
Comment: | 99.5% done with protecting db access with journal check |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
6757cdb9b3b579cafc1cd8f2904aacf0 |
User & Date: | matt on 2014-11-12 21:51:39 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-12
| ||
22:40 | 99.8% check-in: edc56c4136 user: matt tags: v1.60 | |
21:51 | 99.5% done with protecting db access with journal check check-in: 6757cdb9b3 user: matt tags: v1.60 | |
17:14 | 98% done check-in: 24e4d63419 user: mrwellan tags: v1.60 | |
Changes
Modified client.scm from [dc8b2be6ad] to [406d30b1f6].
︙ | ︙ | |||
53 54 55 56 57 58 59 | ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 "client:setup remaining-tries=" remaining-tries) | > > | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 "client:setup remaining-tries=" remaining-tries) (let* ((tdbdat (tasks:open-db)) (tdb (db:dbdat-get-db tdbdat))) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let ((host-info (hash-table-ref/default *runremote* run-id #f))) (if host-info (let* ((iface (http-transport:server-dat-get-iface host-info)) (port (http-transport:server-dat-get-port host-info)) (start-res (http-transport:client-connect iface port)) (ping-res (rmt:login-no-auto-client-setup start-res run-id))) (if ping-res ;; sucessful login? (begin (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries) ;; Why add the close-connections here? ;; (http-transport:close-connections run-id) (hash-table-set! *runremote* run-id start-res) start-res) ;; return the server info ;; have host info but no ping. shutdown the current connection and try again (begin ;; login failed (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info) (http-transport:close-connections run-id) (hash-table-delete! *runremote* run-id) (if (< remaining-tries 8) (thread-sleep! 5) (thread-sleep! 1)) (client:setup run-id remaining-tries: (- remaining-tries 1))))) ;; YUK: rename server-dat here (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if server-dat (let* ((iface (tasks:hostinfo-get-interface server-dat)) (port (tasks:hostinfo-get-port server-dat)) (start-res (http-transport:client-connect iface port)) (ping-res (rmt:login-no-auto-client-setup start-res run-id))) (if (and start-res ping-res) (begin (hash-table-set! *runremote* run-id start-res) (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (http-transport:close-connections run-id) (hash-table-delete! *runremote* run-id) (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") (thread-sleep! 2) (server:try-running run-id) (thread-sleep! 10) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))) (begin ;; no server registered (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (thread-sleep! 2) (if (< num-available 2) (begin ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") (server:try-running run-id))) (thread-sleep! 10) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) ;; client:signal-handler |
︙ | ︙ |
Modified dashboard.scm from [c1ff2abbf5] to [ea8b1971cf].
︙ | ︙ | |||
1451 1452 1453 1454 1455 1456 1457 | (> modtime last-db-update-time) (> (current-seconds)(+ last-db-update-time 1))))) (define *monitor-db-path* (conc *dbdir* "/monitor.db")) (define *last-monitor-update-time* 0) ;; Force creation of the db in case it isn't already there. | | < | 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 | (> modtime last-db-update-time) (> (current-seconds)(+ last-db-update-time 1))))) (define *monitor-db-path* (conc *dbdir* "/monitor.db")) (define *last-monitor-update-time* 0) ;; Force creation of the db in case it isn't already there. (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time) (handle-exceptions exn (begin (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn)) (current-seconds)) ;; something went wrong - just print an error and return current-seconds |
︙ | ︙ |
Modified db.scm from [4ac80e9f1a] to [71a8762428].
︙ | ︙ | |||
473 474 475 476 477 478 479 | (db:dbdat-get-db todb) full-sel) ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) (let* ((db (db:dbdat-get-db targdb)) | | | | | 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 | (db:dbdat-get-db todb) full-sel) ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) (let* ((db (db:dbdat-get-db targdb)) (stmth (sqlite3:prepare db full-ins))) ;; (db:delay-if-busy targdb) ;; NO WAITING (sqlite3:with-transaction db (lambda () (for-each ;; (lambda (fromrow) (let* ((a (vector-ref fromrow 0)) (curr (hash-table-ref/default todat a #f)) (same #t)) (let loop ((i 0)) |
︙ | ︙ | |||
532 533 534 535 536 537 538 | (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f)) (mtdb (if toppath (db:open-megatest-db))) (run-ids (if run-ids run-ids (if toppath (begin (db:delay-if-busy mtdb) (db:get-all-run-ids mtdb))))) | | | | | 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 | (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f)) (mtdb (if toppath (db:open-megatest-db))) (run-ids (if run-ids run-ids (if toppath (begin (db:delay-if-busy mtdb) (db:get-all-run-ids mtdb))))) (tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) ;; kill servers (if (member 'killservers options) (for-each (lambda (server) (tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration") (tasks:kill-server (vector-ref server 2)(vector-ref server 1))) servers)) ;; clear out junk records ;; (if (member 'dejunk options) (begin |
︙ | ︙ | |||
584 585 586 587 588 589 590 | (frundb (db:dbdat-get-db (db:get-db fromdb run-id)))) (db:delay-if-busy frundb) (db:delay-if-busy mtdb) (if (eq? run-id 0) (db:sync-tables (db:sync-main-list dbstruct) fromdb mtdb) (db:sync-tables db:sync-tests-only fromdb mtdb)))) run-ids)) | | | > | 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 | (frundb (db:dbdat-get-db (db:get-db fromdb run-id)))) (db:delay-if-busy frundb) (db:delay-if-busy mtdb) (if (eq? run-id 0) (db:sync-tables (db:sync-main-list dbstruct) fromdb mtdb) (db:sync-tables db:sync-tests-only fromdb mtdb)))) run-ids)) ;; (db:close-all dbstruct) ;; (sqlite3:finalize! mdb) )) ;; keeping it around for debugging purposes only (define (open-run-close-no-exception-handling proc idb . params) (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params) (if (or *db-write-access* (not (member proc *db:all-write-procs*))) (let* ((db (cond |
︙ | ︙ | |||
1828 1829 1830 1831 1832 1833 1834 | pid test-id)))) (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f | > | | | 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 | pid test-id)))) (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (db) (sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;" test-id)))) (define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time" "host" "cpuload" "diskfree" "uname" "rundir" "item_path" "run_duration" "final_logf" "comment" "shortdir" "attemptnum")) ;; fields *must* be a non-empty list ;; |
︙ | ︙ | |||
2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 | (if (null? tal) (map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests (loop (car tal)(cdr tal)))))))))) (define (db:delay-if-busy dbdat #!key (count 6)) (if dbdat (let* ((dbpath (db:dbdat-get-path dbdat)) (dbfj (conc dbpath "-journal"))) (if (file-exists? dbfj) (case count ((6) (thread-sleep! 0.2) (db:delay-if-busy count: 5)) ((5) | > | 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 | (if (null? tal) (map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests (loop (car tal)(cdr tal)))))))))) (define (db:delay-if-busy dbdat #!key (count 6)) (if dbdat (let* ((dbpath (db:dbdat-get-path dbdat)) (db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline (dbfj (conc dbpath "-journal"))) (if (file-exists? dbfj) (case count ((6) (thread-sleep! 0.2) (db:delay-if-busy count: 5)) ((5) |
︙ | ︙ | |||
2455 2456 2457 2458 2459 2460 2461 | (thread-sleep! 3.2) (db:delay-if-busy count: 1)) ((1) (thread-sleep! 6.4) (db:delay-if-busy count: 0)) (else (debug:print-info 0 "delaying db access due to high database load.") | | > > | 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 | (thread-sleep! 3.2) (db:delay-if-busy count: 1)) ((1) (thread-sleep! 6.4) (db:delay-if-busy count: 0)) (else (debug:print-info 0 "delaying db access due to high database load.") (thread-sleep! 12.8)))) db) "bogus result from db:delay-if-busy")) (define (db:test-get-records-for-index-file dbstruct run-id test-name) (let ((res '())) (db:with-db dbstruct run-id #f |
︙ | ︙ |
Modified dcommon.scm from [ed88f64fcd] to [e887ed7ced].
︙ | ︙ | |||
444 445 446 447 448 449 450 | (set! dashboard:update-summary-tab updater) (iup:attribute-set! stats-matrix "WIDTHDEF" "40") (iup:vbox ;; (iup:label "Run statistics" #:expand "HORIZONTAL") stats-matrix))) (define (dcommon:servers-table) | > > | | | 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | (set! dashboard:update-summary-tab updater) (iup:attribute-set! stats-matrix "WIDTHDEF" "40") (iup:vbox ;; (iup:label "Run statistics" #:expand "HORIZONTAL") stats-matrix))) (define (dcommon:servers-table) (let* ((tdbdat (tasks:open-db)) (tdb (db:dbdat-get-db tdbdat)) (colnum 0) (rownum 0) (servers-matrix (iup:matrix #:expand "YES" #:numcol 7 #:numcol-visible 7 #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) ;; (set! colnum (+ 1 colnum))) ;; colnames) |
︙ | ︙ |
Modified http-transport.scm from [c83e2578f6] to [8d5a62d976].
︙ | ︙ | |||
132 133 134 135 136 137 138 | headers: '((content-type text/plain)))) (else (continue)))))))) (http-transport:try-start-server run-id ipaddrstr start-port server-id))) ;; This is recursively run by http-transport:run until sucessful ;; (define (http-transport:try-start-server run-id ipaddrstr portnum server-id) | | > < | < | < | | 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | headers: '((content-type text/plain)))) (else (continue)))))))) (http-transport:try-start-server run-id ipaddrstr start-port server-id))) ;; This is recursively run by http-transport:run until sucessful ;; (define (http-transport:try-start-server run-id ipaddrstr portnum server-id) (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) (tdbdat (tasks:open-db))) (debug:print-info 0 "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname) (handle-exceptions exn (begin (print-error-message exn) (if (< portnum 64000) (begin (debug:print 0 "WARNING: attempt to start server failed. Trying again ...") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 "exn=" (condition->list exn)) (portlogger:open-run-close portlogger:set-failed portnum) (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) ;; get_next_port goes here (http-transport:try-start-server run-id ipaddrstr (portlogger:open-run-close portlogger:find-port) server-id)) (begin (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id ipaddrstr portnum) (debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL ;; (start-server bind-address: ipaddrstr port: portnum) (if config-hostname ;; this is a hint to bind directly (start-server port: portnum bind-address: (if (equal? config-hostname "-") ipaddrstr config-hostname)) (start-server port: portnum)) ;; (portlogger:open-run-close portlogger:set-port portnum "released") (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") (debug:print 1 "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;;====================================================================== |
︙ | ︙ | |||
257 258 259 260 261 262 263 | exn (debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead") (close-all-connections!)) (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) (begin (mutex-unlock! *http-mutex*) | < | | 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 | exn (debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead") (close-all-connections!)) (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) (begin (mutex-unlock! *http-mutex*) (tasks:kill-server-run-id run-id) #f)) (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here (max-retry-attempts 1) ;; consider all requests indempotent (retry-request? (lambda (request) #f)) ;; send the data and get the response ;; extract the needed info from the http data and ;; process and return it. (let* ((send-recieve (lambda () (mutex-lock! *http-mutex*) ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) (set! res (handle-exceptions exn (begin (debug:print 0 "WARNING: failure in with-input-from-request to " fullrul ". Killing associated server to allow clean retry.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (hash-table-delete! *runremote* run-id) (tasks:kill-server-run-id run-id) #f) (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") |
︙ | ︙ | |||
352 353 354 355 356 357 358 | ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive | > | | < | < < | < | | | 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 | ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (let* ((tdbdat (tasks:open-db)) (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (let ((sdat #f)) (thread-sleep! 0.01) (debug:print-info 0 "Waiting for server alive signature") (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) sdat (begin (debug:print-info 0 "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes (begin (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) (server-timeout (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (string->number tmo)) (* 60 60 (string->number tmo)) ;; (* 3 24 60 60) ;; default to three days (* 60 1) ;; default to one minute ;; (* 60 60 25) ;; default to 25 hours )))) (let loop ((count 0) (server-state 'available)) ;; Use this opportunity to sync the inmemdb to db (let ((start-time (current-milliseconds)) (sync-time #f) (rem-time #f)) ;; inmemdb is a dbstruct (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) (set! sync-time (- (current-milliseconds) start-time)) (set! rem-time (quotient (- 4000 sync-time) 1000)) (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time) ;; ;; set_running after our first pass through and start the db ;; (if (eq? server-state 'available) (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *inmemdb* (db:setup run-id)) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))) (if (and (<= rem-time 4) (> rem-time 0)) (thread-sleep! rem-time) (thread-sleep! 4))) ;; fallback for if the math is changed ... (if (< count 1) ;; 3x3 = 9 secs aprox |
︙ | ︙ | |||
467 468 469 470 471 472 473 | ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) | < | < | > | | | | | | | | | | | | < | | | | | | < | | | | | < | | | | | | | | | | | | | | | | | | | | | | | 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 | ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (portlogger:open-run-close portlogger:set-port port "released") (thread-sleep! 5) (debug:print-info 0 "Max cached queries was " *max-cache-size*) (debug:print-info 0 "Number of cached writes " *number-of-writes*) (debug:print-info 0 "Average cached write time " (if (eq? *number-of-writes* 0) "n/a (no writes)" (/ *writes-total-delay* *number-of-writes*)) " ms") (debug:print-info 0 "Number non-cached queries " *number-non-write-queries*) (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") (exit)))))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 "INFO: Server for run-id " run-id " already running") (exit 0))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") "-") run-id server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) ;; Database connection ;; don't start the db here ;; (set! *inmemdb* (db:setup run-id)) (thread-start! th2) (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn (debug:print " ... exiting ...") (let ((th1 (make-thread (lambda () |
︙ | ︙ |
Modified megatest.scm from [9b64b448cf] to [42ff41d3a5].
︙ | ︙ | |||
545 546 547 548 549 550 551 | ;; MAY STILL NEED THIS ;; (set! *megatest-db* (make-dbr:dbstruct path: *toppath* local: #t)))))))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server")) (let ((tl (launch:setup-for-run))) (if tl | > | | 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 | ;; MAY STILL NEED THIS ;; (set! *megatest-db* (make-dbr:dbstruct path: *toppath* local: #t)))))))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server")) (let ((tl (launch:setup-for-run))) (if tl (let* ((tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) (fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n") (servers-to-kill '()) (killinfo (args:get-arg "-stop-server")) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) (sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))) (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport") (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========") |
︙ | ︙ | |||
574 575 576 577 578 579 580 | (killed #f) (status (< last-update 20))) ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server (if (equal? state "dead") (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. | | | | 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 | (killed #f) (status (< last-update 20))) ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server (if (equal? state "dead") (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update (if status "alive" "dead") transport) (if (or (equal? id sid) (equal? sid 0)) ;; kill all/any (begin (debug:print-info 0 "Attempting to stop server with pid " pid) (tasks:kill-server status hostname pullport pid transport))))) |
︙ | ︙ |
Modified rmt.scm from [ec918e30be] to [3dfb2ffd80].
︙ | ︙ | |||
74 75 76 77 78 79 80 | (mutex-unlock! *db-multi-sync-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) | | > > | 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | (mutex-unlock! *db-multi-sync-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (let ((res (client:setup run-id))) (if res (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully) #f)) #f)))) (jparams (db:obj->string params))) (if connection-info |
︙ | ︙ |
Modified runs.scm from [5861640706] to [692dff51df].
︙ | ︙ | |||
212 213 214 215 216 217 218 | (test-records (make-hash-table)) ;; need to process runconfigs before generating these lists (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) | | > | | < < | | | 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | (test-records (make-hash-table)) ;; need to process runconfigs before generating these lists (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) (tdbdat (tasks:open-db))) (set-signal-handler! signal/int (lambda (signum) (signal-mask! signum) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((tdbdat (tasks:open-db))) (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed")) (print "Killed by signal " signum ". Exiting") (exit))) ;; register this run in monitor.db (tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params) (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running") (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (debug:print 0 "WARNING: You do not have a run config file: " runconfigf)) ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all)) |
︙ | ︙ | |||
391 392 393 394 395 396 397 | (if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t)) (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))))) (debug:print-info 0 "No tests to run"))) (debug:print-info 4 "All done by here") | | | > | 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 | (if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t)) (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))))) (debug:print-info 0 "No tests to run"))) (debug:print-info 4 "All done by here") (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done") ;; (sqlite3:finalize! tasks-db) )) ;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable. ;; ;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns ;; If reg is full (i.e. length >= n ;; loop with (car reg) tal (cdr reg) reruns |
︙ | ︙ | |||
1395 1396 1397 1398 1399 1400 1401 | ;; 'set-state-status ;; ;; NB// should pass in keys? ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f)) (common:clear-caches) ;; clear all caches (let* ((db #f) | | | 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 | ;; 'set-state-status ;; ;; NB// should pass in keys? ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f)) (common:clear-caches) ;; clear all caches (let* ((db #f) (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) (statuses (if status (string-split status ",") '())) (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f)))) |
︙ | ︙ | |||
1434 1435 1436 1437 1438 1439 1440 | (debug:print-info 4 "runs:operate-on run=" run ", header=" header) (if (not (null? tests)) (begin (case action ((remove-runs) ;; seek and kill in flight -runtests with % as testpatt here (if (equal? testpatt "%") | | | 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 | (debug:print-info 4 "runs:operate-on run=" run ", header=" header) (if (not (null? tests)) (begin (case action ((remove-runs) ;; seek and kill in flight -runtests with % as testpatt here (if (equal? testpatt "%") (tasks:kill-runner (db:delay-if-busy tdbdat) target run-name) (debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) |
︙ | ︙ | |||
1547 1548 1549 1550 1551 1552 1553 | ;; (if (null? (glob (conc runpath "/*"))) ;; (begin ;; (debug:print 1 "Removing run dir " runpath) ;; (system (conc "rmdir -p " runpath)))) ))))) )) runs) | | > | 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 | ;; (if (null? (glob (conc runpath "/*"))) ;; (begin ;; (debug:print 1 "Removing run dir " runpath) ;; (system (conc "rmdir -p " runpath)))) ))))) )) runs) ;; (sqlite3:finalize! (db:delay-if-busy tdbdat)) ) #t) (define (runs:remove-test-directory db test remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f))) |
︙ | ︙ |
Modified server.scm from [faceda817c] to [13f9300039].
︙ | ︙ | |||
125 126 127 128 129 130 131 | ;; (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) | > | | | | > | | | | | | | | | 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | ;; (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) (let ((tdbdat (tasks:open-db))) (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id)) (trycount 0)) (if server ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; (let ((res (server:ping-server run-id (tasks:hostinfo-get-interface server) (tasks:hostinfo-get-port server)))) ;; if the server didn't respond we must remove the record (if res #t (begin (debug:print-info 0 "server at " server " not responding, removing record") (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id " server:check-if-running") res))) #f)))) ;; called in megatest.scm, host-port is string hostname:port ;; (define (server:ping run-id host:port) (let ((tdbdat (tasks:open-db))) (let* ((host-port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) (toppath (launch:setup-for-run)) (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f))) (if (not run-id) (begin (debug:print 0 "ERROR: must specify run-id when doing ping, -run-id n") (print "ERROR: No run-id") (exit 1)) (if (and (not host-port) (not server-db-dat)) (begin (print "ERROR: bad host:port") (exit 1)) (let* ((iface (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat))) (port (if host-port (cadr host-port)(tasks:hostinfo-get-port server-db-dat))) (server-dat (http-transport:client-connect iface port)) (login-res (rmt:login-no-auto-client-setup server-dat run-id))) (if (and (list? login-res) (car login-res)) (begin (print "LOGIN_OK") (exit 0)) (begin (print "LOGIN_FAILED") (exit 1))))))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server run-id iface port) (with-input-from-pipe (conc (common:get-megatest-exe) " -run-id " run-id " -ping " (conc iface ":" port)) (lambda () |
︙ | ︙ |
Modified tasks.scm from [059408bffa] to [c8e0f86792].
︙ | ︙ | |||
49 50 51 52 53 54 55 | (define (tasks:get-task-db-path) (if *task-db* (vector-ref *task-db* 1) (let* ((linktree (configf:lookup *configdat* "setup" "linktree")) (dbpath (conc linktree "/.db/monitor.db"))) dbpath))) | < < < > > | | | | | | | | | | | | | | | | | | | | | | | | | < | < | < < < < < | | 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | (define (tasks:get-task-db-path) (if *task-db* (vector-ref *task-db* 1) (let* ((linktree (configf:lookup *configdat* "setup" "linktree")) (dbpath (conc linktree "/.db/monitor.db"))) dbpath))) ;; If file exists AND ;; file readable ;; ==> open it ;; If file exists AND ;; file NOT readable ;; ==> open in-mem version ;; If file NOT exists ;; ==> open in-mem version ;; (define (tasks:open-db) (if *task-db* *task-db* (let* ((dbpath (tasks:get-task-db-path)) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away (exists (file-exists? dbpath)) (write-access (file-write-access? dbpath)) (mdb (cond ((file-write-access? *toppath*)(sqlite3:open-database dbpath)) ((file-read-access? dbpath) (sqlite3:open-database dbpath)) (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout 36000))) (if (and exists (not write-access)) (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control (sqlite3:set-busy-handler! mdb handler) (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) (if (or (and (not exists) (file-write-access? *toppath*)) (not (file-read-access? dbpath))) (begin (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, action TEXT DEFAULT '', owner TEXT, state TEXT DEFAULT 'new', target TEXT DEFAULT '', name TEXT DEFAULT '', testpatt TEXT DEFAULT '', keylock TEXT, params TEXT, creation_time TIMESTAMP, execution_time TIMESTAMP);") (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS monitors (id INTEGER PRIMARY KEY, pid INTEGER, start_time TIMESTAMP, last_update TIMESTAMP, hostname TEXT, username TEXT, CONSTRAINT monitors_constraint UNIQUE (pid,hostname));") (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS servers (id INTEGER PRIMARY KEY, pid INTEGER, interface TEXT, hostname TEXT, port INTEGER, pubport INTEGER, start_time TIMESTAMP, priority INTEGER, state TEXT, mt_version TEXT, heartbeat TIMESTAMP, transport TEXT, run_id INTEGER);") ;; CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));") (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY, server_id INTEGER, pid INTEGER, hostname TEXT, cmdline TEXT, login_time TIMESTAMP, logout_time TIMESTAMP DEFAULT -1, CONSTRAINT clients_constraint UNIQUE (pid,hostname));") )) (set! *task-db* (cons mdb dbpath)) *task-db*))) ;;====================================================================== ;; Server and client management ;;====================================================================== ;; make-vector-record tasks hostinfo id interface port pubport transport pid hostname (define (tasks:hostinfo-get-id vec) (vector-ref vec 0)) (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) |
︙ | ︙ | |||
350 351 352 353 354 355 356 | (system (conc "nbfake kill " pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) | | | | | > | 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 | (system (conc "nbfake kill " pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;; (if status ;; #t means alive ;; (begin ;; (if (equal? hostname (get-host-name)) ;; (handle-exceptions ;; exn ;; (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n" |
︙ | ︙ |