︙ | | |
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
+
+
+
+
+
+
+
+
+
+
+
-
+
+
-
-
+
+
|
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;======================================================================
(declare (unit common))
;;======================================================================
;; MODULE STARTS HERE
;;======================================================================
(module common
*
(import chicken scheme data-structures extras srfi-13 ports )
(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
format dot-locking csv-xml z3 ;; sql-de-lite
hostinfo md5 message-digest typed-records directory-utils stack
matchable regex posix (srfi 18) extras ;; tcp
(prefix nanomsg nmsg:)
nanomsg
(prefix sqlite3 sqlite3:)
pkts (prefix dbi dbi:)
srfi-69
)
(declare (unit common))
(include "common_records.scm")
(require-library stml)
;; (import stml)
;; (require-library margs)
;; (include "margs.scm")
;; (define old-exit exit)
;;
;; (define (exit . code)
|
︙ | | |
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
|
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
|
-
-
-
-
-
-
-
-
|
(define (common:get-full-version)
(conc megatest-version "-" megatest-fossil-hash))
(define (common:version-signature)
(conc megatest-version "-" (substring megatest-fossil-hash 0 4)))
;; from metadat lookup MEGATEST_VERSION
;;
(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
(rmt:get-var "MEGATEST_VERSION"))
(define (common:get-last-run-version-number)
(string->number
(substring (common:get-last-run-version) 0 6)))
(define (common:set-last-run-version)
(rmt:set-var "MEGATEST_VERSION" (common:version-signature)))
;; postive number if megatest version > db version
;; negative number if megatest version < db version
(define (common:version-db-delta)
(- megatest-version (common:get-last-run-version-number)))
(define (common:version-changed?)
(not (equal? (common:get-last-run-version)
|
︙ | | |
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
|
802
803
804
805
806
807
808
809
810
811
812
813
814
815
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
(else
(debug:print-info 13 *default-log-port* "loading writable-watchdog.")
(server:writable-watchdog dbstruct)))
(debug:print-info 13 *default-log-port* "watchdog done."))
(debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))
(define (std-exit-procedure)
(on-exit (lambda () 0))
;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
(let ((no-hurry (if *time-to-exit* ;; hurry up
#f
(begin
(set! *time-to-exit* #t)
#t))))
(debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
(if (and no-hurry (debug:debug-mode 18))
(rmt:print-db-stats))
(let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
(if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
(if *task-db*
(let ((db (cdr *task-db*)))
(if (sqlite3:database? db)
(begin
(sqlite3:interrupt! db)
(sqlite3:finalize! db #t)
;; (vector-set! *task-db* 0 #f)
(set! *task-db* #f)))))
(http-client#close-all-connections!)
;; (if (and *runremote*
;; (remote-conndat *runremote*))
;; (begin
;; (http-client#close-all-connections!))) ;; for http-client
(if (not (eq? *default-log-port* (current-error-port)))
(close-output-port *default-log-port*))
(set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))
(th2 (make-thread (lambda ()
(debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...")
(if no-hurry
(begin
(thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff
(begin
(thread-sleep! 2)))
(debug:print 4 *default-log-port* " ... done")
)
"clean exit")))
(thread-start! th1)
(thread-start! th2)
(thread-join! th1)
)
)
0)
(define (std-signal-handler signum)
;; (signal-mask! signum)
(set! *time-to-exit* #t)
;;(debug:print-info 13 *default-log-port* "got signal "signum)
(debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
(exit))
|
︙ | | |
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
|
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
;; (print "NO MATCH: " hed)
(loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))
(define (common:unix-ping hostname)
(let ((res (system (conc "ping -c 1 " hostname " > /dev/null"))))
(eq? res 0)))
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;; ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
(let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
(load (car loadinfo))
(load-sample-time (cdr loadinfo))
(load-sample-age (- (current-seconds) load-sample-time))
(loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
(host-last-update-timeout-seconds 4)
(host-rec (hash-table-ref/default *host-loads* hostname #f))
)
(cond
((< load-sample-age loadinfo-timeout-seconds)
(list #t
load-sample-time
load))
((and host-rec
(< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
(list #t
(host-last-update host-rec)
(host-last-cpuload host-rec )))
((common:unix-ping hostname)
(list #t
(current-seconds)
(alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
(else
(list #f 0 -1) ;; bad host, don't use!
))))
;; see defstruct host at top of file.
;; host: reachable last-update last-used last-cpuload
;;
(define (common:update-host-loads-table hosts-raw)
(let* ((hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw)))
(for-each
(lambda (hostname)
(let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h))))
(host-info (common:get-host-info hostname))
(is-reachable (car host-info))
(last-reached-time (cadr host-info))
(load (caddr host-info)))
(host-reachable-set! rec is-reachable)
(host-last-update-set! rec last-reached-time)
(host-last-cpuload-set! rec load)))
hosts)))
;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the
;; [host-rules] section.
;;
(define (common:get-least-loaded-host hosts-raw host-type configdat)
(let* ((rdat (configf:lookup configdat "host-rules" host-type))
(rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate
(maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
(maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
(maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
(hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw))
;; (best-host #f)
(get-rec (lambda (hostname)
;; (print "get-rec hostname=" hostname)
(let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h)))))
(best-load 99999)
(curr-time (current-seconds))
(get-hosts-sorted (lambda (hosts)
(sort hosts (lambda (a b)
(let ((a-rec (get-rec a))
(b-rec (get-rec b)))
;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec))
;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec))
(< (host-last-used a-rec)
(host-last-used b-rec))))))))
(debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
(if (null? hosts)
#f ;; no hosts to select from. All done and giving up now.
(let ((hosts-sorted (get-hosts-sorted hosts)))
(common:update-host-loads-table hosts)
(let loop ((hostname (car hosts-sorted))
(tal (cdr hosts-sorted))
(best-host #f))
(let* ((rec (get-rec hostname))
(reachable (host-reachable rec))
(load (host-last-cpuload rec))
(last-used (host-last-used rec))
(delta (- curr-time last-used))
(job-rate (if (> delta 0)
(/ 1 delta)
999)) ;; jobs per second
(new-best
(cond
((not reachable)
(debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
best-host)
((and (< load maxnload) ;; load is acceptable
(< job-rate maxjobrate)) ;; job rate is acceptable
(set! best-load load)
hostname)
(else best-host))))
(debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f))
(let* ((loadavg (common:get-cpu-load remote-host))
(numcpus (if (< 1 numcpus-in) ;; not possible
(common:get-num-cpus remote-host)
numcpus-in))
(maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
(first (car loadavg))
(next (cadr loadavg))
(adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
(loadjmp (- first next))
(adjwait (+ (random 10)(/ (- 1000 count) 10) waitdelay))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
(cond
((and (> first adjload)
(> count 0))
(debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp numcpus)
(> count 0))
(debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))
(define (common:wait-for-homehost-load maxload msg)
(let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f))
(numcpus (common:get-num-cpus hh)))
(common:wait-for-normalized-load maxload msg hh)))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
(or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
(if (eof-object? inl)
(begin
(common:write-cached-info remote-host "num-cpus" numcpu)
numcpu)
(loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
(+ numcpu 1)
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(common:write-cached-info actual-host "num-cpus" result)
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host)
(let ((num-cpus (common:get-num-cpus remote-host)))
(common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))
(define (get-uname . params)
(let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
(uname #f))
(if (null? (car uname-res))
"unknown"
(caar uname-res))))
;; for reasons I don't understand multiple calls to real-path in parallel threads
;; must be protected by mutexes
;;
(define (common:real-path inpath)
;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
;; (let-values
;; (((inp oup pid) (process "readlink" (list "-f" inpath))))
;; (with-input-from-port inp
;; (let loop ((inl (read-line))
;; (res #f))
;; (print "inl=" inl)
;; (if (eof-object? inl)
;; (begin
;; (close-input-port inp)
;; (close-output-port oup)
;; ;; (process-wait pid)
;; res)
;; (loop (read-line) inl))))))
(with-input-from-pipe (conc "readlink -f " inpath) read-line))
;;======================================================================
;; D I S K S P A C E
;;======================================================================
(define (common:get-disk-space-used fpath)
(with-input-from-pipe (conc "/usr/bin/du -s " fpath) read))
;; given path get free space, allows override in [setup]
;; with free-space-script /path/to/some/script.sh
;;
(define (get-df path)
(if (configf:lookup *configdat* "setup" "free-space-script")
(with-input-from-pipe
(conc (configf:lookup *configdat* "setup" "free-space-script") " " path)
(lambda ()
(let ((res (read-line)))
(if (string? res)
(string->number res)))))
(get-unix-df path)))
(define (get-unix-df path)
(let* ((df-results (process:cmd-run->list (conc "df " path)))
(space-rx (regexp "([0-9]+)\\s+([0-9]+)%"))
(freespc #f))
;; (write df-results)
(for-each (lambda (l)
(let ((match (string-search space-rx l)))
(if match
(let ((newval (string->number (cadr match))))
(if (number? newval)
(set! freespc newval))))))
(car df-results))
freespc))
(define (common:check-space-in-dir dirpath required)
(let* ((dbspace (if (directory? dirpath)
(get-df dirpath)
0)))
(list (> dbspace required)
dbspace
required
dirpath)))
;; check space in dbdir and in megatest dir
;; returns: ok/not dbspace required-space
;;
(define (common:check-db-dir-space)
(let* ((required (string->number
(or (configf:lookup *configdat* "setup" "dbdir-space-required")
"100000")))
(dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir))
(tdbspace (common:check-space-in-dir dbdir required))
(mdbspace (common:check-space-in-dir *toppath* required)))
(sort (list tdbspace mdbspace) (lambda (a b)
(< (cadr a)(cadr b))))))
;; check available space in dbdir, exit if insufficient
;;
(define (common:check-db-dir-and-exit-if-insufficient)
(let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now
(is-ok (car spacedat))
(dbspace (cadr spacedat))
(required (caddr spacedat))
(dbdir (cadddr spacedat)))
(if (not is-ok)
(begin
(debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.")
(exit 1)))))
;; paths is list of lists ((name path) ... )
;;
(define (common:get-disk-with-most-free-space disks minsize)
(let ((best #f)
(bestsize 0))
(for-each
(lambda (disk-num)
(let* ((dirpath (cadr (assoc disk-num disks)))
(freespc (cond
((not (directory? dirpath))
(if (common:low-noise-print 300 "disks not a dir " disk-num)
(debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it."))
-1)
((not (file-write-access? dirpath))
(if (common:low-noise-print 300 "disks not writeable " disk-num)
(debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it."))
-1)
((not (eq? (string-ref dirpath 0) #\/))
(if (common:low-noise-print 300 "disks not a proper path " disk-num)
(debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it."))
-1)
(else
(get-df dirpath)))))
(if (> freespc bestsize)
(begin
(set! best (cons disk-num dirpath))
(set! bestsize freespc)))))
(map car disks))
(if (and best (> bestsize minsize))
best
#f))) ;; #f means no disk candidate found
;; convert a spec string to a list of vectors #( rx action rx-string )
(define (common:spec-string->list-of-specs spec-string actions)
(let ((spec-strings (string-split-fields "\\s*;\\s*" spec-string #:infix))
(actions-regex (regexp (conc "^(.*)\\s+(" (string-intersperse (map conc actions) "|") ")"))))
(filter
(lambda (x) x)
(map (lambda (s)
|
︙ | | |
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
|
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
|
-
-
-
-
-
-
-
-
-
-
-
-
|
(proc (proc)))
(hash-table-for-each
vars
(lambda (var val)
(setenv var val)))
vars))
(define (common:run-a-command cmd #!key (with-vars #f))
(let* ((pre-cmd (dtests:get-pre-command))
(post-cmd (dtests:get-post-command))
(fullcmd (if (or pre-cmd post-cmd)
(conc pre-cmd cmd post-cmd)
(conc "viewscreen " cmd))))
(debug:print-info 02 *default-log-port* "Running command: " fullcmd)
(if with-vars
(common:without-vars cmd)
(common:without-vars fullcmd "MT_.*"))))
;;======================================================================
;; T I M E A N D D A T E
;;======================================================================
;; Convert strings like "5s 2h 3m" => 60x60x2 + 3x60 + 5
(define (common:hms-string->seconds tstr)
(let ((parts (string-split-fields "\\w+" tstr))
|
︙ | | |
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
|
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
(string-intersperse
(map (lambda (x)
(number->string x 16))
(map string->number
(string-split instr)))
"/"))
;;======================================================================
;; L O C K I N G M E C H A N I S M S
;;======================================================================
;; faux-lock is deprecated. Please use simple-lock below
;;
(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
(if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
(if (> wait-time 0)
(begin
(thread-sleep! 1)
(if (eq? wait-time 1) ;; only one second left, steal the lock
(begin
(debug:print-info 0 *default-log-port* "stealing lock for " keyname)
(common:faux-unlock keyname force: #t)))
(common:faux-lock keyname wait-time: (- wait-time 1)))
#f)
(begin
(rmt:no-sync-set keyname (conc (current-process-id)))
(equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))))
(define (common:faux-unlock keyname #!key (force #f))
(if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
(begin
(if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
#t)
#f))
;; simple lock. improve and converge on this one.
;;
(define (common:simple-lock keyname)
(rmt:no-sync-get-lock keyname))
;;======================================================================
;;
;;======================================================================
(define (common:in-running-test?)
(and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO")))
|
︙ | | |
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
|
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
|
;; ;; get registered dashboards
;; ;;
;; (define (mddb:get-dashboards)
;; (let ((db (mddb:open-db)))
;; (query fetch-column
;; (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;"))))
;;======================================================================
;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S
;;======================================================================
;;
;; [hosts]
;; arm cubie01 cubie02
;; x86_64 zeus xena myth01
;; allhosts #{g hosts arm} #{g hosts x86_64}
;;
;; [host-types]
;; general #MTLOWESTLOAD #{g hosts allhosts}
;; arm #MTLOWESTLOAD #{g hosts arm}
;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
;;
;; [host-rules]
;; # maxnload => max normalized load
;; # maxnjobs => max jobs per cpu
;; # maxjobrate => max jobs per second
;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1
;;
;; [launchers]
;; envsetup general
;; xor/%/n 4C16G
;; % nbgeneral
;;
;; [jobtools]
;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match.
;; flexi-launcher yes
;; launcher nbfake
;;
(define (common:get-launcher configdat testname itempath)
(let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher")))
(if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher
(not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no")))
(let* ((launchers (hash-table-ref/default configdat "launchers" '())))
(if (null? launchers)
fallback-launcher
(let loop ((hed (car launchers))
(tal (cdr launchers)))
(let ((patt (car hed))
(host-type (cadr hed)))
(if (tests:match patt testname itempath)
(begin
(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
(let ((launcher (configf:lookup configdat "host-types" host-type)))
(if launcher
(let* ((launcher-parts (string-split launcher))
(launcher-exe (car launcher-parts)))
(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
(let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat))
(count 100))
(if targ-host
(conc "remrun " targ-host)
(if (> count 0)
(begin
(debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type)
(thread-sleep! (- 101 count))
(host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)
(- count 1)))
(begin
(debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type)
(exit)))))
launcher))
(begin
(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal)))))))
;; no match, try again
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal))))))))
fallback-launcher)))
;;======================================================================
;; NMSG AND NEW API
;;======================================================================
;; nm based server experiment, keep around for now.
;;
(define (nm:start-server dbconn #!key (given-host-name #f))
#;(define (nm:start-server dbconn #!key (given-host-name #f))
(let* ((srvdat (start-raw-server given-host-name: given-host-name))
(host-name (srvdat-host srvdat))
(soc (srvdat-soc srvdat)))
;; start the queue processor (save for second round of development)
;;
(thread-start! (make-thread! (lambda ()(queue-processor dbconn) "Queue processor")))
|
︙ | | |
2929
2930
2931
2932
2933
2934
2935
|
2480
2481
2482
2483
2484
2485
2486
2487
2488
|
+
+
|
(if thread
(handle-exceptions
exn
#t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
(thread-join! thread))
)))
(hash-table-keys *common:thread-punchlist*)))
)
|