Changes In Branch v1.81-adjutant Excluding Merge-Ins
This is equivalent to a diff from 98f3441b4f to 13060ce126
2024-08-19
| ||
11:42 | CI/CD: Automated commit after successful test, build, and deploy for v1.81-fix-extract-scripts check-in: 29155bc147 user: fdiskadm tags: v1.81 | |
2024-08-18
| ||
23:21 | CI/CD: Automated commit after successful test, build, and deploy for v1.81-adjutant check-in: b939ba890d user: ramartin tags: v1.81-fix-extract-scripts | |
2024-08-16
| ||
13:34 | Patched forward the adjutant code that got lost in v1.65 Leaf check-in: 13060ce126 user: matt tags: v1.81-adjutant | |
2024-08-13
| ||
12:55 | removed extra copy of launch:extract-scripts-logpro and corrected it to add .logpro to the logpro filenames check-in: e829926867 user: mmgraham tags: v1.81-fix-extract-scripts | |
2024-07-17
| ||
19:21 | Changed Megatest version to v1.8181 check-in: a748f29739 user: icfadm tags: v1.81 | |
19:17 | Lower gating on test launch to 0.05 journal load. Add exception handler for file-modification-time on .servinfo files check-in: 7c315bd32d user: mrwellan tags: v1.81-fixes | |
17:13 | Move sync transaction in an attempt to free up bound time in .mtdb/*.db files Leaf check-in: fab9bf9c5c user: mrwellan tags: v1.81-better sync | |
2024-07-15
| ||
15:47 | Changed Megatest version to 1.8102 check-in: 98f3441b4f user: icfadm tags: v1.81 | |
15:12 | CI/CD: Automated commit after successful test, build, and deploy for v1.81-bump-server-load check-in: 1fff14fbea user: fdiskadm tags: v1.81 | |
Modified Makefile from [d32576ac8f] to [b3bc6cb258].
︙ | ︙ | |||
34 35 36 37 38 39 40 | process.scm runs.scm tasks.scm tests.scm genexample.scm \ tdb.scm mt.scm \ ezsteps.scm rmt.scm api.scm \ subrun.scm archive.scm env.scm \ diff-report.scm cgisetup/models/pgdb.scm # module source files | | < | 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | process.scm runs.scm tasks.scm tests.scm genexample.scm \ tdb.scm mt.scm \ ezsteps.scm rmt.scm api.scm \ subrun.scm archive.scm env.scm \ diff-report.scm cgisetup/models/pgdb.scm # module source files MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm adjutant.scm mutils.scm mttop.scm tcp-transportmod.scm rmtmod.scm portlogger.scm transport-mode.scm : transport-mode.scm.template cp transport-mode.scm.template transport-mode.scm dashboard-transport-mode.scm : dashboard-transport-mode.scm.template cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm |
︙ | ︙ |
Modified adjutant.scm from [7560fecb1c] to [d6c67b1549].
︙ | ︙ | |||
20 21 22 23 24 25 26 | (declare (unit adjutant)) (module adjutant * (import scheme chicken data-structures extras files) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69 | | | | > > > > > > > > > > > | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | (declare (unit adjutant)) (module adjutant * (import scheme chicken data-structures extras files) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69 md5 message-digest matchable regex srfi-1) (define (adjutant-run host-type rmt:no-sync-take-job) (print "Running the adjutant!") (let loop ((wait-count 0)) (if (< wait-count 10) ;; 6 x 10 seconds = one minute (let* ((dat (rmt:no-sync-take-job host-type))) (match dat ((id ht vars exekey cmdline state event-time last-update) (system cmdline) (loop 0)) (else (thread-sleep! 10) (loop (+ wait-count 1))))) (print "I'm bored. Exiting.")))) ) |
Modified api.scm from [0ec121fad4] to [7f68d2f308].
︙ | ︙ | |||
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 | ((tasks-get-last) (apply tasks:get-last dbstruct params)) ;; NO SYNC DB ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) ;; NO SYNC DB PROCESSES ((register-process) (apply dbfile:register-process *no-sync-db* params)) ((set-process-done) (apply dbfile:set-process-done *no-sync-db* params)) ((set-process-status) (apply dbfile:set-process-status *no-sync-db* params)) ((get-process-options) (apply dbfile:get-process-options *no-sync-db* params)) ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) | > > > | | 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 | ((tasks-get-last) (apply tasks:get-last dbstruct params)) ;; NO SYNC DB ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) ((no-sync-del!) (apply db:no-sync-del! 
*no-sync-db* params)) ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) ((no-sync-add-job) (apply db:no-sync-add-job *no-sync-db* params)) ((no-sync-take-job) (apply db:no-sync-take-job *no-sync-db* params)) ((no-sync-job-records-clean) (apply db:no-sync-job-records-clean *no-sync-db* params)) ;; NO SYNC DB PROCESSES ((register-process) (apply dbfile:register-process *no-sync-db* params)) ((set-process-done) (apply dbfile:set-process-done *no-sync-db* params)) ((set-process-status) (apply dbfile:set-process-status *no-sync-db* params)) ((get-process-options) (apply dbfile:get-process-options *no-sync-db* params)) ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) ;;====================================================================== ;; READ ONLY QUERIES ;;====================================================================== ;; KEYS ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) ((get-keys) (db:get-keys dbstruct)) |
︙ | ︙ |
Modified common.scm from [dd0c23fb98] to [c6c75a6980].
︙ | ︙ | |||
2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 | ;; ;; [hosts] ;; arm cubie01 cubie02 ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo ;; ;; [host-rules] ;; # maxnload => max normalized load ;; # maxnjobs => max jobs per cpu ;; # maxjobrate => max jobs per second ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general | > > > > | > | > > > | | | > > > > | | 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 | ;; ;; [hosts] ;; arm cubie01 cubie02 ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] ;; C/M/A lets megatest know this launcher provides C cores, M bytes memory for architecture A ;; 2/2G/arm smart -cores 2 -memory 2G -arch arm ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo ;; ;; NOTE: host-rules is ONLY used for MTLOWESTLOAD ;; ;; [host-rules] ;; # maxnload => max normalized load ;; # maxnjobs => max jobs per cpu ;; # maxjobrate => max jobs per second ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general ;; xor/%/n 2/2G/arm ;; % nbgeneral ;; ;; [jobtools] ;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. 
;; flexi-launcher yes ;; launcher nbfake ;; mode adjutant|normal (default is normal) ;; ;; ;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant) ;; (define (common:get-launcher configdat testname itempath mode) (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) (if (null? launchers) fallback-launcher (let loop ((hed (car launchers)) (tal (cdr launchers))) (let ((patt (car hed)) (host-type (cadr hed))) (if (tests:match patt testname itempath) ;; have a launcher match for this test (begin (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) (let ((launcher (configf:lookup configdat "host-types" host-type))) ;; find the actual launcher from the host-types table ;; if we are in adjutant mode then we want to return both host-type and launcher (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) (count 100)) (if targ-host (conc "remrun " targ-host) (if (> count 0) (begin (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) (thread-sleep! 
(- 101 count)) (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) (- count 1))) (begin (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) (exit))))) (case mode ((adjutant) (list host-type launcher)) (else launcher)))) (begin (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) (if (null? tal) fallback-launcher (loop (car tal)(cdr tal))))))) ;; no match, try again (if (null? tal) |
︙ | ︙ |
Modified db.scm from [346b188c56] to [52f3f2dced].
︙ | ︙ | |||
1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 | ;; (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;") dead-runs)) ;;====================================================================== ;; no-sync.db - small bits of data to be shared between servers ;;====================================================================== (define (db:get-dbsync-path) (case (rmt:transport-mode) ((http)(common:make-tmpdir-name *toppath* "")) ((tcp) (conc *toppath*"/.mtdb")) ((nfs) (conc *toppath*"/.mtdb")) (else "/tmp/dunno-this-gonna-exist"))) ;; This is needed for api.scm (define (db:open-no-sync-db) (dbfile:open-no-sync-db (db:get-dbsync-path))) ;; why get the keys from the db? why not get from the *configdat* ;; using keys:config-get-fields? | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 | ;; (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;") dead-runs)) ;;====================================================================== ;; no-sync.db - small bits of data to be shared between servers ;;====================================================================== ;; if we are not a server create a db handle. this is not finalized ;; so watch for problems. I'm still not clear if it is needed to manually ;; finalize sqlite3 dbs with the sqlite3 egg. ;; (define (db:no-sync-db db-in) (mutex-lock! 
*db-access-mutex*) (let ((res (if db-in db-in (let ((db (db:open-no-sync-db))) (set! *no-sync-db* db) db)))) (mutex-unlock! *db-access-mutex*) res)) (define (db:get-dbsync-path) (case (rmt:transport-mode) ((http)(common:make-tmpdir-name *toppath* "")) ((tcp) (conc *toppath*"/.mtdb")) ((nfs) (conc *toppath*"/.mtdb")) (else "/tmp/dunno-this-gonna-exist"))) (define (db:no-sync-add-job db-in host-type vars-list exekey cmdline) (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,cmdline,state,event_time,last_update) VALUES (?,?,?,?,?,?,?);" host-type (with-output-to-string (lambda () (write vars-list))) exekey cmdline "waiting" (current-seconds)(current-seconds))) ;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match (define (db:no-sync-take-job db-in host-type) (let* ((db (db:no-sync-db db-in)) (stmt1 "SELECT id,host_type,vars,exekey,cmdline,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;") (stmt1h (sqlite3:prepare db stmt1)) (stmt2 "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;") (stmt2h (sqlite3:prepare db stmt2)) (res (sqlite3:with-transaction db (lambda () (let* ((matching-jobs (sqlite3:fold-row (lambda (res . row) ;; id host-type vars exekey state event-time last-update) (cons row res)) '() stmt1h host-type))) (if (null? matching-jobs) #f (let ((choosen-one (let loop ((tal matching-jobs) (res #f)) ;; put bestest one in here (if (null? tal) res (let ((curr (car tal)) (rem (cdr tal))) curr) ;; here we will compare with res, if better candidate the loop with curr else loop with res )))) (if choosen-one ;; we need to mark it as taken (sqlite3:execute stmt2h (current-seconds) (car choosen-one))) choosen-one))))))) (sqlite3:finalize! stmt1h) ;; it'd be nice to cache these and finalize on exit. (sqlite3:finalize! stmt2h) res)) ;; clean out old jobs in queue, i.e. 
taken and event_time > 24 hrs ago ;; (define (db:no-sync-job-records-clean db) (sqlite3:execute (db:no-sync-db db) "DELETE FROM jobs_queue WHERE state='taken' AND event_time < ?;" (- (current-seconds)(* 24 3600)))) (define (db:no-sync-get/default db-in var default) (let ((db (db:no-sync-db db-in)) (res default)) (sqlite3:for-each-row (lambda (val) (set! res val)) (db:no-sync-db db) "SELECT val FROM no_sync_metadat WHERE var=?;" var) (if res (let ((newres (if (string? res) (string->number res) #f))) (if newres newres res)) res))) ;; This is needed for api.scm (define (db:open-no-sync-db) (dbfile:open-no-sync-db (db:get-dbsync-path))) ;; why get the keys from the db? why not get from the *configdat* ;; using keys:config-get-fields? |
︙ | ︙ |
Modified dbfile.scm from [fd3c73f7ce] to [32ab28635d].
︙ | ︙ | |||
548 549 550 551 552 553 554 555 556 557 558 559 560 561 | ;; I have been having trouble with init of no-sync.db so ;; doing the init in a transaction every time (no gating ;; on file existance. (for-each (lambda (stmt) (sqlite3:execute db stmt)) (list "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT, val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));" "CREATE TABLE IF NOT EXISTS no_sync_locks (key TEXT, val TEXT, | > > > > > > > > > > > > > > > > | 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | ;; I have been having trouble with init of no-sync.db so ;; doing the init in a transaction every time (no gating ;; on file existance. (for-each (lambda (stmt) (sqlite3:execute db stmt)) (list "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, cmdline TEXT, state TEXT, event_time INTEGER, last_update INTEGER);" "CREATE TABLE IF NOT EXISTS test_extra_data (id INTEGER PRIMARY KEY, run_id INTEGER, test_id INTEGER, last_seen_running INTEGER);" "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT, val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));" "CREATE TABLE IF NOT EXISTS no_sync_locks (key TEXT, val TEXT, |
︙ | ︙ |
Modified launch.scm from [053403603a] to [b2baab613e].
︙ | ︙ | |||
1551 1552 1553 1554 1555 1556 1557 | (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) | | > > > > > > > > | < < < < < < < < < < < < | > > > < | | | | | | | | | > > > > > > > | 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 | (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) (defstruct launch:ajt (vars '()) (exekey #f) (host-type #f) (test-sig #f) (cmdline #f)) ;; append vars (define (launch:ajt-add-vars dat vars) (launch:ajt-vars-set! dat (append (launch:ajt-vars dat) vars))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (assert runname "FATAL: launch-test called with no runname") (mutex-lock! 
*launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* (;; locking code removed from here commented out and pasted at end of file (item-path (item-list->path itemdat)) (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour"))) ;; launcher-mode will be 'adjutant or 'normal (launcher-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal"))) (ajtdat (make-launch:ajt))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin ;; (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. ;; (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) (let ((var-list (append (list (list "MT_RUN_AREA_HOME" *toppath*) (list "MT_TEST_NAME" test-name) (list "MT_RUNNAME" runname) (list "MT_ITEMPATH" item-path) (list "MT_CONTOUR" contour) ) itemdat))) ;; consolidate this code with the code in megatest.scm for ;; "-execute", *maybe* - the longer they are set the longer ;; each launch takes (must be non-overlapping with the vars) (alist->env-vars var-list) ;; the var-list into the ajtdat adjutant record whether it is needed or not. (launch:ajt-add-vars ajtdat var-list)) (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. 
This is likely not desired.") test-conf))) ;; force re-read now that all vars are set (useshell (let ((ush (configf:lookup *configdat* "jobtools" "useshell"))) |
︙ | ︙ | |||
1619 1620 1621 1622 1623 1624 1625 | (subrun (> (length (hash-table-ref/default tconfig "subrun" '())) 0)) ;; send a flag to process a subrun ;; (diskspace (configf:lookup tconfig "requirements" "diskspace")) ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup *configdat* "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup *configdat* "setup" "runtimelim"))) | < < < < < < < > | | < < < < < > > | > > > > > > > > > | | 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 | (subrun (> (length (hash-table-ref/default tconfig "subrun" '())) 0)) ;; send a flag to process a subrun ;; (diskspace (configf:lookup tconfig "requirements" "diskspace")) ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup *configdat* "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup *configdat* "setup" "runtimelim"))) (local-megatest (common:find-local-megatest)) (launcher (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode))) (if (string? l) (string-split l) l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) 
OR '(host-type launcher) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '()) (if (configf:lookup *configdat* "misc" "profilesw") (list (configf:lookup *configdat* "misc" "profilesw")) '())))) ;; save the test-sig in the ajtdat record (launch:ajt-test-sig-set! ajtdat test-sig) ;; go ahead and figure out if we have a host-type from the ;; launcher call above and save it in the ajtdat record (if (and (eq? launcher-mode 'adjutant) (list? launcher) (> (length launcher) 1)) (launch:ajt-host-type-set! ajtdat (car launcher))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) ;; (if launcher (set! launcher (string-split launcher))) ;; yuk! ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record |
︙ | ︙ | |||
1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 | (list 'target mt_target) (list 'contour contour) (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) (setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir | > > | > | | < > < | < | | | | | | | | < | > > | | | > > > | | | | | | | | | | | > > > > > > > > > > > > > > > > > > > > > > > > | < | 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 | (list 'target mt_target) (list 'contour contour) (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) ;; save the cmdparms in the ajtdat (launch:ajt-exekey-set! 
ajtdat cmdparms) (setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir ;; save the command line for adjutant mode (might never be needed but best to assemble it here) (launch:ajt-cmdline-set! ajtdat (string-intersperse (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) (cond (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) (else (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. 
(let* ((env-override-vars (hash-table-ref/default *configdat* "env-override" '())) (commonprevvals (alist->env-vars env-override-vars)) (misc-vars (append (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) (list "MT_ITEM_INFO" (conc itemdat)) (list "MT_RUNNAME" runname) (list "MT_TARGET" mt_target) (list "MT_ITEMPATH" item-path)) itemdat)) (miscprevvals (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute" (test-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '())) (testprevvals (alist->env-vars test-vars)) ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. (launch-results-prev (if (eq? launcher-mode 'adjutant) '(#t 0) ;; just some fake data to fool downstream but non-applicable code (apply (if launchwait process:cmd-run-with-stderr-and-exitcode->list process-run) (if useshell (let ((cmdstr (string-intersperse fullcmd " "))) (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1 &"))) (car fullcmd)) (if useshell '() (cdr fullcmd))))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) (launch:ajt-add-vars ajtdat env-override-vars) (launch:ajt-add-vars ajtdat misc-vars) (launch:ajt-add-vars ajtdat test-vars) ;; if in adjutant mode we register the job in the jobs_queue ;; then fire off an adjutant runner ;; (if (eq? 
launcher-mode 'adjutant) (let* ((adjutant-runner-cmd (append (cdr launcher) (list remote-megatest "-adjutant" (launch:ajt-host-type ajtdat) "-start-dir" *toppath*))) (adj-cmd (conc (string-intersperse (map conc adjutant-runner-cmd) " ") "&"))) (rmt:no-sync-add-job (launch:ajt-host-type ajtdat) (launch:ajt-vars ajtdat) (launch:ajt-exekey ajtdat) (launch:ajt-cmdline ajtdat)) (print "adj-cmd: " adj-cmd) (system adj-cmd) )) (if (not success) (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) (if (list? launch-results) |
︙ | ︙ | |||
1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 | ;; but this hack will work! Thanks go to Alan Post of the Chicken email list ;; NB// Is this still needed? Should be safe to go back to "exit" now? (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) launch-results)) (change-directory *toppath*) (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died ;; (define (launch:recover-test run-id test-id) | > > > > | 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 | ;; but this hack will work! Thanks go to Alan Post of the Chicken email list ;; NB// Is this still needed? Should be safe to go back to "exit" now? (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. ;; the unlock previously was further up. This seemed wrong as we should not proceed until the ;; vars have been reset. (mutex-unlock! *launch-setup-mutex*) launch-results)) (change-directory *toppath*) (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died ;; (define (launch:recover-test run-id test-id) |
︙ | ︙ | |||
1820 1821 1822 1823 1824 1825 1826 | (read-symbolic-link (conc "/proc/" pid "/cwd")) #f))) ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) | > > > > > > > > > > > > > > > > | 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 | (read-symbolic-link (conc "/proc/" pid "/cwd")) #f))) ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) ;; #t ;; (if (> (current-seconds) expire-time) ;; (begin ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) ;; (rmt:no-sync-del! lock-key) ;; destroy the lock ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; ;; (begin ;; (thread-sleep! 1) ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) |
Modified megatest.scm from [458fe118db] to [c5c28080d4].
︙ | ︙ | |||
51 52 53 54 55 56 57 58 59 60 61 62 63 64 | (declare (uses db)) (declare (uses dbfile)) (declare (uses dbfile.import)) (declare (uses dbmod)) (declare (uses dbmod.import)) (declare (uses portlogger)) (declare (uses portlogger.import)) (declare (uses tcp-transportmod)) (declare (uses tcp-transportmod.import)) (declare (uses rmtmod)) (declare (uses rmtmod.import)) ;; (declare (uses debugprint)) ;; (declare (uses debugprint.import)) | > > > > > > > | 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | (declare (uses db)) (declare (uses dbfile)) (declare (uses dbfile.import)) (declare (uses dbmod)) (declare (uses dbmod.import)) (declare (uses portlogger)) (declare (uses portlogger.import)) (declare (uses adjutant)) (import adjutant) (declare (uses mttop)) (import mttop) (declare (uses tcp-transportmod)) (declare (uses tcp-transportmod.import)) (declare (uses rmtmod)) (declare (uses rmtmod.import)) ;; (declare (uses debugprint)) ;; (declare (uses debugprint.import)) |
︙ | ︙ | |||
81 82 83 84 85 86 87 | (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:)) | > > | > > | > > > > > > > > > | 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:)) (use apropos call-with-environment-variables directory-utils extras format http-client json matchable readline srfi-18 tcp tcp-server typed-records ) ;; Added for csv stuff - will be removed ;; (use sparse-vectors) (require-library mutils) |
︙ | ︙ | |||
151 152 153 154 155 156 157 158 159 160 161 162 163 164 | version " megatest-version " license GPL, Copyright Matt Welland 2006-2017 Usage: megatest [options] -h : this help -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only the run data. Use -kill-wait to override the 10 second per test wait after kill delay (e.g. -kill-wait 0). | > | 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | version " megatest-version " license GPL, Copyright Matt Welland 2006-2017 Usage: megatest [options] -h : this help -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") help : help for the new Megatest interface Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only the run data. Use -kill-wait to override the 10 second per test wait after kill delay (e.g. -kill-wait 0). |
︙ | ︙ | |||
248 249 250 251 252 253 254 | -sync-to-megatest.db : pull data from cache files in /tmp/$USER to megatest.db -sync-to dest : sync to new postgresql central style database -update-meta : update the tests metadata for all tests -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are overwritten by values set in config files. -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname | | | 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 | -sync-to-megatest.db : pull data from cache files in /tmp/$USER to megatest.db -sync-to dest : sync to new postgresql central style database -update-meta : update the tests metadata for all tests -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are overwritten by values set in config files. -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname -adjutant host-type : start the server/adjutant with given host-type use 0,0 to auto use full machine -transport http|rpc : use http or rpc for transport (default is http) -log logfile : send stdout and stderr to logfile -list-servers : list the servers -kill-servers : kill all servers -repl : start a repl (useful for extending megatest) -load file.scm : load and run file.scm |
︙ | ︙ | |||
320 321 322 323 324 325 326 327 328 329 330 331 332 333 | Called as " (string-intersperse (argv) " ") " Version " megatest-version ", built from " megatest-fossil-hash )) ;; -gui : start a gui interface ;; -config fname : override the runconfigs file with fname ;; process args (define remargs (args:get-args (argv) (list "-runtests" ;; run a specific test "-config" ;; override the config file name "-append-config" "-execute" ;; run the command encoded in the base64 parameter | > > > > | 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 | Called as " (string-intersperse (argv) " ") " Version " megatest-version ", built from " megatest-fossil-hash )) ;; -gui : start a gui interface ;; -config fname : override the runconfigs file with fname (mttop-run (command-line-arguments) '("help")) ;; process args (define remargs (args:get-args (argv) (list "-runtests" ;; run a specific test "-config" ;; override the config file name "-append-config" "-execute" ;; run the command encoded in the base64 parameter |
︙ | ︙ | |||
988 989 990 991 992 993 994 995 996 997 998 | (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1))))) (else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode)))) (set! *didsomething* #t))) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; (if (args:get-arg "-adjutant") | > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > | | 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 | (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1))))) (else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode)))) (set! *didsomething* #t))) (define (naylist->alist inlst) (map (lambda (dat) (cons (car dat) (or (if (list? (cdr dat)) (if (null? (cdr dat)) "" (cadr dat)) (cdr dat)) ""))) ;; we need a string for call-with-environment-variables inlst)) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. 
;; (if (args:get-arg "-adjutant") (let* ((host-type (args:get-arg "-adjutant"))) (launch:setup) ;; dang it, wish this wasn't needed (print "Running the adjutant!") (let loop ((wait-count 0)) (if (< wait-count 10) ;; 6 x 10 seconds = one minute (let* ((dat (rmt:no-sync-take-job host-type))) (match dat ((id ht vars exekey cmdline state event-time last-update) (let ((vars-alist (with-input-from-string vars read) )) (print "Vars:") (pp vars-alist) (call-with-environment-variables (naylist->alist vars-alist) (lambda () (system cmdline)))) (loop 0)) (else (thread-sleep! 10) (loop (+ wait-count 1))))) (print "I'm bored. Exiting."))) ;; (adjutant-run (args:get-arg "-ajutant") rmt:no-sync-take-job) (set! *didsomething* #t))) (if (args:get-arg "-list-servers") (let* ((tl (launch:setup)) ;; need this to initialize *toppath* (servdir (tt:get-servinfo-dir *toppath*)) (servfiles (glob (conc servdir "/*:*.db"))) (fmtstr "~10a~22a~10a~25a~25a~8a\n") |
︙ | ︙ |
Added mttop.scm version [0ba1c89f48].
;; Copyright 2006-2011, Matthew Welland.
;;
;; This program is made available under the GNU GPL version 2.0 or
;; greater. See the accompanying file COPYING for details.
;;
;; This program is distributed WITHOUT ANY WARRANTY; without even the
;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;; PURPOSE.

;; mttop: front-end dispatcher for the new word-style Megatest commands
;; (e.g. "megatest help"). It inspects the first command line argument
;; and either handles it and exits, or returns #f so that the caller can
;; continue with the legacy dash-style argument processing.
;;

(declare (unit mttop))

(module mttop
        *

(import chicken scheme
        ;; data-structures posix
        srfi-1
        ;; srfi-13
        srfi-69
        ports
        extras
        regex
        posix
        data-structures
        matchable
        )

;; Resolve CMD, a possibly abbreviated command word, against ALL-CMDS,
;; a list of full command name strings.
;;
;; Returns the full command name when CMD identifies exactly one
;; command, otherwise #f:
;;   - an exact match always wins, even if it is also a prefix of
;;     another command (e.g. "run" with both "run" and "runs" known)
;;   - otherwise CMD must be a prefix of exactly one command
;;   - an empty CMD never matches
;;
;; The comparison is a literal, case-sensitive prefix test. CMD comes
;; straight from the command line, so it must not be interpreted as a
;; regular expression: arguments such as "(" or "[" would otherwise be
;; rejected by the regex compiler and abort the program before the
;; legacy argument processing gets a chance to run.
;;
(define (str-is-cmd cmd all-cmds)
  (let ((cmd-len (string-length cmd)))
    ;; #t when cmd is a literal prefix of candidate
    (define (abbreviates? candidate)
      (and (<= cmd-len (string-length candidate))
           (string=? cmd (substring candidate 0 cmd-len))))
    (cond
     ((zero? cmd-len) #f)            ;; "" is not an abbreviation of anything
     ((member cmd all-cmds) cmd)     ;; exact name, never ambiguous
     (else
      (let ((matches (filter abbreviates? all-cmds)))
        (if (and (pair? matches)
                 (null? (cdr matches))) ;; exactly one candidate
            (car matches)
            #f))))))

;; Entry point for the new command interface.
;;
;;   args     : the command line arguments (list of strings)
;;   all-cmds : list of the known new-style command names
;;
;; If the first argument resolves to a new-style command the command is
;; run and the process exits. Otherwise #f is returned and the caller
;; carries on with the old Megatest argument handling.
;;
(define (mttop-run args all-cmds)
  ;; any path through this call must end in exit if it is NOT an old Megatest call
  (if (null? args)
      #f ;; continue on and do the old Megatest stuff
      (let ((cmd (str-is-cmd (car args) all-cmds)))
        (if cmd
            (begin
              (case (string->symbol cmd)
                ((help)(print "New help"))
                (else (print "Command " cmd " is not implemented yet.")))
              (exit)) ;; always exit here
            #f)))) ;; or continue on to Megatest old stuff here
)
Modified rmt.scm from [0cdd3c737a] to [d0aaf6cd91].
︙ | ︙ | |||
726 727 728 729 730 731 732 733 734 735 736 737 738 739 | (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) (define (rmt:no-sync-del! var) (rmt:send-receive 'no-sync-del! #f `(,var))) (define (rmt:no-sync-get-lock keyname) (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) ;; process registration (define (rmt:register-process host port pid starttime status purpose dbname mtversion) (rmt:send-receive 'register-process #f (list host port pid starttime status purpose dbname mtversion))) (define (rmt:set-process-done host pid reason) | > > > > > > > > > | 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 | (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) (define (rmt:no-sync-del! var) (rmt:send-receive 'no-sync-del! #f `(,var))) (define (rmt:no-sync-get-lock keyname) (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) (define (rmt:no-sync-add-job host-type vars-list exekey cmdline) (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey ,cmdline))) (define (rmt:no-sync-take-job host-type) (rmt:send-receive 'no-sync-take-job #f `(,host-type))) (define (rmt:no-sync-job-records-clean) (rmt:set-receive 'no-sync-job-records-clean #f '())) ;; process registration (define (rmt:register-process host port pid starttime status purpose dbname mtversion) (rmt:send-receive 'register-process #f (list host port pid starttime status purpose dbname mtversion))) (define (rmt:set-process-done host pid reason) |
︙ | ︙ |