Overview
Comment: | job rate, load limit working for MTLOWESTLOAD |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 |
Files: | files | file ages | folders |
SHA1: |
f1ae1885662d78efd7bc206352c21a1a |
User & Date: | matt on 2018-02-05 23:33:47 |
Other Links: | branch diff | manifest | tags |
Context
2018-02-06
| ||
05:59 | Rebuilt manual check-in: c57a166878 user: matt tags: v1.65 | |
2018-02-05
| ||
23:33 | job rate, load limit working for MTLOWESTLOAD check-in: f1ae188566 user: matt tags: v1.65 | |
17:02 | Updated megatest-version check-in: ab7e6db80c user: jmoon18 tags: v1.65, v1.6508 | |
Changes
Modified common.scm from [87be6fd2a8] to [9660178c65].
︙ | ︙ | |||
459 460 461 462 463 464 465 466 467 468 469 470 471 472 | (common:to-alist (cdr dat)))) ((hash-table? dat) (map common:to-alist (hash-table->alist dat))) (else (if dat dat "")))) (define (common:low-noise-print waitval . keys) (let* ((key (string-intersperse (map conc keys) "-" )) (lasttime (hash-table-ref/default *common:denoise* key 0)) (currtime (current-seconds))) (if (> (- currtime lasttime) waitval) (begin | > > > | 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 | (common:to-alist (cdr dat)))) ((hash-table? dat) (map common:to-alist (hash-table->alist dat))) (else (if dat dat "")))) (define (common:alist-ref/default key alist default) (or (alist-ref key alist) default)) (define (common:low-noise-print waitval . keys) (let* ((key (string-intersperse (map conc keys) "-" )) (lasttime (hash-table-ref/default *common:denoise* key 0)) (currtime (current-seconds))) (if (> (- currtime lasttime) waitval) (begin |
︙ | ︙ | |||
1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 | (loop (car tal) (cdr tal) new-rownames new-colnames (if (> curr-rownum rownum) curr-rownum rownum) (if (> curr-colnum colnum) curr-colnum colnum) )))))) ;;====================================================================== ;; S Y S T E M S T U F F ;;====================================================================== ;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 ;; | > > > > > > > > > > > > > > > > > > > > > > > | 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 | (loop (car tal) (cdr tal) new-rownames new-colnames (if (> curr-rownum rownum) curr-rownum rownum) (if (> curr-colnum colnum) curr-colnum colnum) )))))) ;; if it looks like a number -> convert it to a number, else return it ;; (define (common:lazy-convert inval) (let* ((as-num (if (string? inval)(string->number inval) #f))) (or as-num inval))) ;; convert string a=1; b=2; c=a silly thing; d= ;; to '((a . 1)(b . 2)(c . "a silly thing")(d . "")) ;; (define (common:val->alist val #!key (convert #f)) (let ((val-list (string-split-fields ";\\s*" val #:infix))) (if val-list (map (lambda (x) (let ((f (string-split-fields "\\s*=\\s*" x #:infix))) (case (length f) ((0) `(,#f)) ;; null string case ((1) `(,(string->symbol (car f)))) ((2) `(,(string->symbol (car f)) . ,(let ((inval (cadr f))) (if convert (common:lazy-convert inval) inval)))) (else f)))) val-list) '()))) ;;====================================================================== ;; S Y S T E M S T U F F ;;====================================================================== ;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 ;; |
︙ | ︙ | |||
1538 1539 1540 1541 1542 1543 1544 | (eq? res 0))) ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; ;; return list of ;; ( reachable? cpuload update-time ) (define (common:get-host-info hostname) | | | | | | | | | > | > > > | 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 | (eq? res 0))) ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; ;; return list of ;; ( reachable? cpuload update-time ) (define (common:get-host-info hostname) (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data (load (car loadinfo)) (load-sample-time (cdr loadinfo)) (load-sample-age (- (current-seconds) load-sample-time)) (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds (host-last-update-timeout-seconds 4) (host-rec (hash-table-ref/default *host-loads* hostname #f)) ) (cond ((< load-sample-age loadinfo-timeout-seconds) (list #t load-sample-time load)) ((and host-rec (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) (list #t (host-last-update host-rec) (host-last-cpuload host-rec ))) ((common:unix-ping hostname) (list #t (current-seconds) (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds (else (list #f 0 -1) ;; bad host, don't use! )))) ;; see defstruct host at top of file. ;; host: reachable last-update last-used last-cpuload ;; (define (common:update-host-loads-table hosts-raw) (let* ((hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw))) (for-each (lambda (hostname) (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) |
︙ | ︙ | |||
1584 1585 1586 1587 1588 1589 1590 | (last-reached-time (cadr host-info)) (load (caddr host-info))) (host-reachable-set! rec is-reachable) (host-last-update-set! rec last-reached-time) (host-last-cpuload-set! rec load))) hosts))) | > > > | > > > > > | | | | > > > > > > > > | > > > > > > > > > > > > | > | | | < < < < < < | | > > > > > > | | < > | > > | < | | > > > > > > > | 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 | (last-reached-time (cadr host-info)) (load (caddr host-info))) (host-reachable-set! rec is-reachable) (host-last-update-set! rec last-reached-time) (host-last-cpuload-set! rec load))) hosts))) ;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the ;; [host-rules] section. ;; (define (common:get-least-loaded-host hosts-raw host-type configdat) (let* ((rdat (configf:lookup configdat "host-rules" host-type)) (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second (hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw)) ;; (best-host #f) (get-rec (lambda (hostname) ;; (print "get-rec hostname=" hostname) (let ((h (hash-table-ref/default *host-loads* hostname #f))) (if h h (let ((h (make-host))) (hash-table-set! *host-loads* hostname h) h))))) (best-load 99999) (curr-time (current-seconds)) (get-hosts-sorted (lambda (hosts) (sort hosts (lambda (a b) (let ((a-rec (get-rec a)) (b-rec (get-rec b))) ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) (< (host-last-used a-rec) (host-last-used b-rec)))))))) (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) (if (null? hosts) #f ;; no hosts to select from. All done and giving up now. (let ((hosts-sorted (get-hosts-sorted hosts))) (common:update-host-loads-table hosts) (let loop ((hostname (car hosts-sorted)) (tal (cdr hosts-sorted)) (best-host #f)) (let* ((rec (get-rec hostname)) (reachable (host-reachable rec)) (load (host-last-cpuload rec)) (last-used (host-last-used rec)) (delta (- curr-time last-used)) (job-rate (if (> delta 0) (/ 1 delta) 999)) ;; jobs per second (new-best (cond ((not reachable) (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") best-host) ((and (< load maxnload) ;; load is acceptable (< job-rate maxjobrate)) ;; job rate is acceptable (set! best-load load) hostname) (else best-host)))) (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) (if new-best (begin ;; found a host, return it (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (< 1 numcpus-in) ;; not possible (common:get-num-cpus remote-host) numcpus-in)) (maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME? |
︙ | ︙ | |||
2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 | ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo ;; ;; [launchers] ;; envsetup general ;; xor/%/n 4C16G ;; % nbgeneral ;; ;; [jobtools] | > > > > > > | 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 | ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo ;; ;; [host-rules] ;; # maxnload => max normalized load ;; # maxnjobs => max jobs per cpu ;; # maxjobrate => max jobs per second ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general ;; xor/%/n 4C16G ;; % nbgeneral ;; ;; [jobtools] |
︙ | ︙ | |||
2404 2405 2406 2407 2408 2409 2410 | (begin (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) (let ((launcher (configf:lookup configdat "host-types" host-type))) (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline | | > > | > > > > > > > > > | 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 | (begin (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) (let ((launcher (configf:lookup configdat "host-types" host-type))) (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) (count 100)) (if targ-host (conc "remrun " targ-host) (if (> count 0) (begin (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) (thread-sleep! (- 101 count)) (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) (- count 1))) (begin (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) (exit))))) launcher)) (begin (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) (if (null? tal) fallback-launcher (loop (car tal)(cdr tal))))))) ;; no match, try again |
︙ | ︙ |
Modified mtut.scm from [8d657d6f02] to [401524b519].
︙ | ︙ | |||
263 264 265 266 267 268 269 | ;; given a mtutil param, return the old megatest equivalent ;; (define (megatest-param->mtutil-param param) (let* ((mapping-alist (common:get-param-mapping flavor: 'switch-symbol))) (alist-ref (string->symbol param) mapping-alist eq? param) param)) | | < < < < < < < < < < < | 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 | ;; given a mtutil param, return the old megatest equivalent ;; (define (megatest-param->mtutil-param param) (let* ((mapping-alist (common:get-param-mapping flavor: 'switch-symbol))) (alist-ref (string->symbol param) mapping-alist eq? param) param)) (define val->alist common:val->alist) (define (push-run-spec torun contour runkey spec) (configf:section-var-set! torun contour runkey (cons spec (or (configf:lookup torun contour runkey) '())))) |
︙ | ︙ | |||
545 546 547 548 549 550 551 | ;; ;; Override the run start time record with sched. Usually #f is fine. ;; (define (create-run-pkt mtconf action area runkey target runname mode-patt tag-expr pktsdir reason contour sched dbdest append-conf runtrans) (let* ((good-val (lambda (inval)(and inval (string? inval)(not (string-null? inval))))) | | | 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 | ;; ;; Override the run start time record with sched. Usually #f is fine. ;; (define (create-run-pkt mtconf action area runkey target runname mode-patt tag-expr pktsdir reason contour sched dbdest append-conf runtrans) (let* ((good-val (lambda (inval)(and inval (string? inval)(not (string-null? inval))))) (area-dat (common:val->alist (or (configf:lookup mtconf "areas" area) ""))) (area-path (alist-ref 'path area-dat)) ;; (area-xlatr (alist-ref 'targtrans area-dat)) ;; (xlatr-key (if area-xlatr (string->symbol area-xlatr) #f)) (new-runname (let* ((callname (if (string? runtrans)(string->symbol runtrans) #f)) (mapper (if callname (hash-table-ref/default *runname-mappers* callname #f) #f))) ;; (print "callname=" callname " runtrans=" runtrans " mapper=" mapper) (if (and callname |
︙ | ︙ | |||
669 670 671 672 673 674 675 | (keyparts (string-split key ":")) ;; contour:ruletype:action:optional (contour (car keyparts)) (len-key (length keyparts)) (ruletype (if (> len-key 1)(cadr keyparts) #f)) (action (if (> len-key 2)(caddr keyparts) #f)) (optional (if (> len-key 3)(cadddr keyparts) #f)) ;; (val-list (string-split-fields ";\\s*" val #:infix)) ;; (string-split val)) ;; runname-rule params | | | 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 | (keyparts (string-split key ":")) ;; contour:ruletype:action:optional (contour (car keyparts)) (len-key (length keyparts)) (ruletype (if (> len-key 1)(cadr keyparts) #f)) (action (if (> len-key 2)(caddr keyparts) #f)) (optional (if (> len-key 3)(cadddr keyparts) #f)) ;; (val-list (string-split-fields ";\\s*" val #:infix)) ;; (string-split val)) ;; runname-rule params (val-alist (common:val->alist val)) (runname (make-runname "" "")) (runtrans (alist-ref 'runtrans val-alist)) ;; these may or may not be defined and not all are used in each handler type in the case below (run-name (alist-ref 'run-name val-alist)) (target (alist-ref 'target val-alist)) (crontab (alist-ref 'cron val-alist)) |
︙ | ︙ | |||
914 915 916 917 918 919 920 | keydats))) ;; sense rules (hash-table-keys rgconf)) ;; now have to run populated (for-each (lambda (contour) (let* ((cval (or (configf:lookup mtconf "contours" contour) "")) | | | | 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 | keydats))) ;; sense rules (hash-table-keys rgconf)) ;; now have to run populated (for-each (lambda (contour) (let* ((cval (or (configf:lookup mtconf "contours" contour) "")) (cval-alist (common:val->alist cval)) ;; BEWARE ... NOT the same val-alist as above! (areas (val-alist->areas cval-alist)) (selector (alist-ref 'selector cval-alist)) (mode-tag (and selector (string-split-fields "/" selector #:infix))) (mode-patt (and mode-tag (if (eq? (length mode-tag) 2)(cadr mode-tag) #f))) (tag-expr (and mode-tag (if (null? mode-tag) #f (car mode-tag))))) (print "contour: " contour " areas=" areas " cval=" cval) (for-each (lambda (runkeydatset) ;; (print "runkeydatset: ")(pp runkeydatset) (let ((runkey (car runkeydatset)) (runkeydats (cadr runkeydatset))) (for-each (lambda (runkeydat) (for-each (lambda (area) (if (area-allowed? area areas runkey contour mode-patt) ;; is this area to be handled (from areas=a,b,c OR using areafn=abcfn and *area-checks* ...) (let* ((aval (or (configf:lookup mtconf "areas" area) "")) (aval-alist (common:val->alist aval)) (runname (alist-ref 'runname runkeydat)) (runtrans (alist-ref 'runtrans runkeydat)) (reason (alist-ref 'message runkeydat)) (sched (alist-ref 'sched runkeydat)) (action (alist-ref 'action runkeydat)) (dbdest (alist-ref 'dbdest runkeydat)) |
︙ | ︙ | |||
1111 1112 1113 1114 1115 1116 1117 | (if *action* (case (string->symbol *action*) ((run remove rerun set-ss archive kill list) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) (area (args:get-arg "-area")) ;; look up the area to dispatch to from [areas] section (areasec (if area (configf:lookup mtconf "areas" area) #f)) | | | 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 | (if *action* (case (string->symbol *action*) ((run remove rerun set-ss archive kill list) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) (area (args:get-arg "-area")) ;; look up the area to dispatch to from [areas] section (areasec (if area (configf:lookup mtconf "areas" area) #f)) (areadat (if areasec (common:val->alist areasec) #f)) (area-path (if areadat (alist-ref 'path areadat) #f)) (pktsdirs (configf:lookup mtconf "setup" "pktsdirs")) (pktsdir (if pktsdirs (car (string-split pktsdirs " ")) #f)) (adjargs (hash-table-copy args:arg-hash)) (new-ss (args:get-arg "-new"))) ;; check a few things (cond |
︙ | ︙ |