︙ | | | ︙ | |
459
460
461
462
463
464
465
466
467
468
469
470
471
472
|
(common:to-alist (cdr dat))))
((hash-table? dat)
(map common:to-alist (hash-table->alist dat)))
(else
(if dat
dat
""))))
(define (common:low-noise-print waitval . keys)
(let* ((key (string-intersperse (map conc keys) "-" ))
(lasttime (hash-table-ref/default *common:denoise* key 0))
(currtime (current-seconds)))
(if (> (- currtime lasttime) waitval)
(begin
|
>
>
>
|
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
|
(common:to-alist (cdr dat))))
((hash-table? dat)
(map common:to-alist (hash-table->alist dat)))
(else
(if dat
dat
""))))
(define (common:alist-ref/default key alist default)
(or (alist-ref key alist) default))
(define (common:low-noise-print waitval . keys)
(let* ((key (string-intersperse (map conc keys) "-" ))
(lasttime (hash-table-ref/default *common:denoise* key 0))
(currtime (current-seconds)))
(if (> (- currtime lasttime) waitval)
(begin
|
︙ | | | ︙ | |
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
|
(loop (car tal)
(cdr tal)
new-rownames
new-colnames
(if (> curr-rownum rownum) curr-rownum rownum)
(if (> curr-colnum colnum) curr-colnum colnum)
))))))
;;======================================================================
;; S Y S T E M S T U F F
;;======================================================================
;; lazy-safe get file mod time. on any error (file not existing etc.) return 0
;;
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
|
(loop (car tal)
(cdr tal)
new-rownames
new-colnames
(if (> curr-rownum rownum) curr-rownum rownum)
(if (> curr-colnum colnum) curr-colnum colnum)
))))))
;; if it looks like a number -> convert it to a number, else return it
;;
(define (common:lazy-convert inval)
(let* ((as-num (if (string? inval)(string->number inval) #f)))
(or as-num inval)))
;; convert string a=1; b=2; c=a silly thing; d=
;; to '((a . 1)(b . 2)(c . "a silly thing")(d . ""))
;;
(define (common:val->alist val #!key (convert #f))
(let ((val-list (string-split-fields ";\\s*" val #:infix)))
(if val-list
(map (lambda (x)
(let ((f (string-split-fields "\\s*=\\s*" x #:infix)))
(case (length f)
((0) `(,#f)) ;; null string case
((1) `(,(string->symbol (car f))))
((2) `(,(string->symbol (car f)) . ,(let ((inval (cadr f)))
(if convert (common:lazy-convert inval) inval))))
(else f))))
val-list)
'())))
;;======================================================================
;; S Y S T E M S T U F F
;;======================================================================
;; lazy-safe get file mod time. on any error (file not existing etc.) return 0
;;
|
︙ | | | ︙ | |
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
|
(eq? res 0)))
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;; ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
(let* ((loadinfo (rmt:get-latest-host-load hostname))
(load (car loadinfo))
(load-sample-time (cdr loadinfo))
(load-sample-age (- (current-seconds) load-sample-time))
(loadinfo-timeout-seconds 20)
(host-last-update-timeout-seconds 10)
(host-rec (hash-table-ref/default *host-loads* hostname #f))
)
(cond
((< load-sample-age loadinfo-timeout-seconds)
(list #t
load-sample-time
load))
((and host-rec
(< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
(list #t
(host-last-update host-rec)
(host-last-cpuload host-rec )))
((common:unix-ping hostname)
(list #t
(current-seconds)
(alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname))))
(else
(list #f 0 -1)))))
(define (common:update-host-loads-table hosts-raw)
(let* ((hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw)))
(for-each
(lambda (hostname)
(let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f)))
|
|
|
|
|
|
|
|
|
>
|
>
>
>
|
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
|
(eq? res 0)))
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;; ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
(let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
(load (car loadinfo))
(load-sample-time (cdr loadinfo))
(load-sample-age (- (current-seconds) load-sample-time))
(loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
(host-last-update-timeout-seconds 4)
(host-rec (hash-table-ref/default *host-loads* hostname #f))
)
(cond
((< load-sample-age loadinfo-timeout-seconds)
(list #t
load-sample-time
load))
((and host-rec
(< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
(list #t
(host-last-update host-rec)
(host-last-cpuload host-rec )))
((common:unix-ping hostname)
(list #t
(current-seconds)
(alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
(else
(list #f 0 -1) ;; bad host, don't use!
))))
;; see defstruct host at top of file.
;; host: reachable last-update last-used last-cpuload
;;
(define (common:update-host-loads-table hosts-raw)
(let* ((hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw)))
(for-each
(lambda (hostname)
(let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f)))
|
︙ | | | ︙ | |
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
|
(last-reached-time (cadr host-info))
(load (caddr host-info)))
(host-reachable-set! rec is-reachable)
(host-last-update-set! rec last-reached-time)
(host-last-cpuload-set! rec load)))
hosts)))
(define (common:get-least-loaded-host hosts-raw)
(let* ((hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw))
(best-host #f)
(best-load 99999)
(curr-time (current-seconds)))
(common:update-host-loads-table hosts)
(for-each
(lambda (hostname)
(let* ((rec
(let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h))))
(reachable (host-reachable rec))
(load (host-last-cpuload rec)))
(cond
((not reachable) #f)
((< (+ load (/ (random 250) 1000)) ;; add a random factor to keep from getting in a rut
(+ best-load (/ (random 250) 1000)) )
(set! best-load load)
(set! best-host hostname)))))
hosts)
best-host))
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f))
(let* ((loadavg (common:get-cpu-load remote-host))
(numcpus (if (< 1 numcpus-in) ;; not possible
(common:get-num-cpus remote-host)
numcpus-in))
(maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
|
>
>
>
|
>
>
>
>
>
|
|
|
|
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
|
>
|
|
|
<
<
<
<
<
<
|
|
>
>
>
>
>
>
|
|
<
>
|
>
>
|
<
|
|
>
>
>
>
>
>
>
|
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
|
(last-reached-time (cadr host-info))
(load (caddr host-info)))
(host-reachable-set! rec is-reachable)
(host-last-update-set! rec last-reached-time)
(host-last-cpuload-set! rec load)))
hosts)))
;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the
;; [host-rules] section.
;;
(define (common:get-least-loaded-host hosts-raw host-type configdat)
(let* ((rdat (configf:lookup configdat "host-rules" host-type))
(rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate
(maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
(maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
(maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
(hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw))
;; (best-host #f)
(get-rec (lambda (hostname)
;; (print "get-rec hostname=" hostname)
(let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h)))))
(best-load 99999)
(curr-time (current-seconds))
(get-hosts-sorted (lambda (hosts)
(sort hosts (lambda (a b)
(let ((a-rec (get-rec a))
(b-rec (get-rec b)))
;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec))
;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec))
(< (host-last-used a-rec)
(host-last-used b-rec))))))))
(debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
(if (null? hosts)
#f ;; no hosts to select from. All done and giving up now.
(let ((hosts-sorted (get-hosts-sorted hosts)))
(common:update-host-loads-table hosts)
(let loop ((hostname (car hosts-sorted))
(tal (cdr hosts-sorted))
(best-host #f))
(let* ((rec (get-rec hostname))
(reachable (host-reachable rec))
(load (host-last-cpuload rec))
(last-used (host-last-used rec))
(delta (- curr-time last-used))
(job-rate (if (> delta 0)
(/ 1 delta)
999)) ;; jobs per second
(new-best
(cond
((not reachable)
(debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
best-host)
((and (< load maxnload) ;; load is acceptable
(< job-rate maxjobrate)) ;; job rate is acceptable
(set! best-load load)
hostname)
(else best-host))))
(debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f))
(let* ((loadavg (common:get-cpu-load remote-host))
(numcpus (if (< 1 numcpus-in) ;; not possible
(common:get-num-cpus remote-host)
numcpus-in))
(maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
|
︙ | | | ︙ | |
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
|
;; x86_64 zeus xena myth01
;; allhosts #{g hosts arm} #{g hosts x86_64}
;;
;; [host-types]
;; general #MTLOWESTLOAD #{g hosts allhosts}
;; arm #MTLOWESTLOAD #{g hosts arm}
;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
;;
;; [launchers]
;; envsetup general
;; xor/%/n 4C16G
;; % nbgeneral
;;
;; [jobtools]
|
>
>
>
>
>
>
|
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
|
;; x86_64 zeus xena myth01
;; allhosts #{g hosts arm} #{g hosts x86_64}
;;
;; [host-types]
;; general #MTLOWESTLOAD #{g hosts allhosts}
;; arm #MTLOWESTLOAD #{g hosts arm}
;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
;;
;; [host-rules]
;; # maxnload => max normalized load
;; # maxnjobs => max jobs per cpu
;; # maxjobrate => max jobs per second
;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1
;;
;; [launchers]
;; envsetup general
;; xor/%/n 4C16G
;; % nbgeneral
;;
;; [jobtools]
|
︙ | | | ︙ | |
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
|
(begin
(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
(let ((launcher (configf:lookup configdat "host-types" host-type)))
(if launcher
(let* ((launcher-parts (string-split launcher))
(launcher-exe (car launcher-parts)))
(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
(let ((targ-host (common:get-least-loaded-host (cdr launcher-parts))))
(conc "remrun " targ-host))
launcher))
(begin
(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal)))))))
;; no match, try again
|
|
>
>
|
>
>
>
>
>
>
>
>
>
|
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
|
(begin
(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
(let ((launcher (configf:lookup configdat "host-types" host-type)))
(if launcher
(let* ((launcher-parts (string-split launcher))
(launcher-exe (car launcher-parts)))
(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
(let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat))
(count 100))
(if targ-host
(conc "remrun " targ-host)
(if (> count 0)
(begin
(debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type)
(thread-sleep! (- 101 count))
(host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)
(- count 1)))
(begin
(debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type)
(exit)))))
launcher))
(begin
(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal)))))))
;; no match, try again
|
︙ | | | ︙ | |