Overview
Comment: | Updated common to have a little less delay when checking loads since tquery is quite quick to check |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | v2.01-tquery | v2.01-try-1 |
Files: | files | file ages | folders |
SHA1: |
7b78f70ebe35f93ed572f15d5af5e566 |
User & Date: | jmoon18 on 2018-12-11 15:11:16 |
Other Links: | branch diff | manifest | tags |
Context
2018-12-11
| ||
15:11 | Updated common to have a little less delay when checking loads since tquery is quite quick to check Closed-Leaf check-in: 7b78f70ebe user: jmoon18 tags: v2.01-tquery, v2.01-try-1 | |
2018-12-06
| ||
09:52 | Initial version of tquery check-in: 06b3f69cfc user: jmoon18 tags: v2.01-tquery, v2.01-try-1 | |
Changes
Modified common.scm from [8a7bc69ba9] to [c80e9730c4].
︙ | ︙ | |||
1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 | (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log"))) (handle-exceptions exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) (define (common:get-cpu-load remote-host) (let ((al (common:get-normalized-cpu-load remote-host))) (list (alist-ref '1m-load al) (alist-ref '5m-load al) (alist-ref '15m-load al))) ;;(common:get-cpu-load-original remote-host) ) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load-original remote-host) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) (or (common:get-cached-info actual-hostname "cpu-load") (let ((result (if remote-host | > > > > > > | 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 | (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log"))) (handle-exceptions exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) (define (common:get-cpu-load remote-host) (handle-exceptions exn (lambda() (list 50 50 50) ) (let ((al (common:get-normalized-cpu-load remote-host))) (list (alist-ref '1m-load al) (alist-ref '5m-load al) (alist-ref '15m-load al))) ;;(common:get-cpu-load-original remote-host) ) ) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load-original remote-host) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) (or (common:get-cached-info actual-hostname "cpu-load") (let ((result (if remote-host |
︙ | ︙ | |||
1535 1536 1537 1538 1539 1540 1541 | (define (common:get-normalized-cpu-load remote-host) (if (file-exists? (pathname-expand "~/.megatest/tquery")) (begin (with-input-from-file (pathname-expand "~/.megatest/tquery") (lambda() (set! tqfilecontents (read-string)) )) | > > > > > | > | | < < < < < < < | 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 | (define (common:get-normalized-cpu-load remote-host) (if (file-exists? (pathname-expand "~/.megatest/tquery")) (begin (with-input-from-file (pathname-expand "~/.megatest/tquery") (lambda() (set! tqfilecontents (read-string)) )) (handle-exceptions exn (lambda() (sleep 1) (common:get-normalized-cpu-load remote-host) ) (set! tqfileparts (string-split (string-trim-both tqfilecontents) ":")) ) ;;(print "TQuery host: " (car tqfileparts)) ;;(print "TQuery port " (cadr tqfileparts)) ;;(print "Getting normalized cpu load for : " remote-host " via " (car tqfileparts) ":" (cadr tqfileparts)) ) (begin (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -") (sleep 2) ) ) (handle-exceptions exn (lambda() ;;(print "Need to start tquery server here:") (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -") (sleep 2) (common:get-normalized-cpu-load remote-host) ) (define-values (i o) (tcp-connect (car tqfileparts) (string->number (cadr tqfileparts)))) ;;(define-values (i o) (tcp-connect "plxcas102" 9000)) (write-line (conc "adj-cpuload-full:" (if remote-host remote-host (get-host-name))) o) ;;(write-line "adj-cpuload-full:plxcm5005" o) (with-input-from-string (read-line i) read) ) ) (define (common:get-normalized-cpu-load-original remote-host) (let ((res (common:get-normalized-cpu-load-raw-original remote-host)) (default `((adj-proc-load . 2) ;; there is no right answer (adj-core-load . 2) |
︙ | ︙ | |||
1767 1768 1769 1770 1771 1772 1773 | (if new-best (begin ;; found a host, return it (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) | | | | > > > > > > | > | 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 | (if new-best (begin ;; found a host, return it (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 100) (msg #f)(remote-host #f)(force-maxload #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (< 1 numcpus-in) ;; not possible (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload maxload-in (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 100 count) 10) waitdelay) (- first adjload) ) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp " ,adjwait: " adjwait " ,numcpus: " numcpus ", loadjmp: " loadjmp) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (debug:print-info 1 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (debug:print-info 1 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f)) (numcpus (common:get-num-cpus hh))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) (handle-exceptions exn (lambda() 2 ) (alist-ref 'core (common:get-normalized-cpu-load remote-host)) ) ) (define (common:get-num-cpus-orig remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! (let* ((proc (lambda () (let loop ((numcpu 0) |
︙ | ︙ |