Overview
Context
Changes
Modified megatest.scm
from [b300543c22]
to [16af3583d8].
︙ | | |
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
|
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
|
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
|
;; The watchdog is to keep an eye on things like db sync etc.
;;
(define *watchdog*
(make-thread
(lambda ()
(thread-sleep! 0.05) ;; delay for startup
(let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db")))
(let loop ()
;; sync for filesystem local db writes
;;
(let ((start-time (current-seconds))
(servers-started (make-hash-table)))
(for-each
(lambda (run-id)
(mutex-lock! *db-multi-sync-mutex*)
(if (hash-table-ref/default *db-local-sync* run-id #f)
;; (if (> (- start-time last-write) 5) ;; every five seconds
(begin ;; let ((sync-time (- (current-seconds) start-time)))
(db:multi-db-sync (list run-id) 'new2old)
(if (common:low-noise-print 30 "sync new to old")
(let ((sync-time (- (current-seconds) start-time)))
(debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds")
(if (> sync-time 10) ;; took more than ten seconds, start a server for this run
(begin
(debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id)
(server:kind-run run-id)))))
(hash-table-delete! *db-local-sync* run-id)))
(mutex-unlock! *db-multi-sync-mutex*))
(hash-table-keys *db-local-sync*)))
(let loop ()
;; sync for filesystem local db writes
;;
(let ((start-time (current-seconds))
(servers-started (make-hash-table)))
(for-each
(lambda (run-id)
(mutex-lock! *db-multi-sync-mutex*)
(if (and legacy-sync
(hash-table-ref/default *db-local-sync* run-id #f))
;; (if (> (- start-time last-write) 5) ;; every five seconds
(begin ;; let ((sync-time (- (current-seconds) start-time)))
(db:multi-db-sync (list run-id) 'new2old)
(if (common:low-noise-print 30 "sync new to old")
(let ((sync-time (- (current-seconds) start-time)))
(debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds")))
;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run
;; (begin
;; (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id)
;; (server:kind-run run-id)))))
(hash-table-delete! *db-local-sync* run-id)))
(mutex-unlock! *db-multi-sync-mutex*))
(hash-table-keys *db-local-sync*)))
;; keep going unless time to exit
;;
(if (not *time-to-exit*)
(begin
(thread-sleep! 1) ;; wait one second before syncing again
(loop)))))
"Watchdog thread"))
;; keep going unless time to exit
;;
(if (not *time-to-exit*)
(begin
(thread-sleep! 1) ;; wait one second before syncing again
(loop)))))
"Watchdog thread")))
(thread-start! *watchdog*)
(if (args:get-arg "-log")
(let ((oup (open-output-file (args:get-arg "-log"))))
(debug:print-info 0 "Sending log output to " (args:get-arg "-log"))
(current-error-port oup)
|
︙ | | |
Modified rmt.scm
from [714450135c]
to [a4cf4136f4].
︙ | | |
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
-
-
-
-
+
+
+
-
-
+
-
-
+
-
-
-
-
-
-
-
-
-
-
-
+
|
;; no longer killing the server in http-transport:client-api-send-receive
;; may kill it here but what are the criteria?
;; start with three calls then kill server
(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
(let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
(if (and (< attemptnum 10)
(configf:lookup *configdat* "server" "required"))
(begin
(if (and (< attemptnum 10)
(tasks:need-server run-id))
(begin
(debug:print-info 0 "Server required mode, attempting to start server and retry query in ten seconds")
(server:kind-run run-id)
(tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
(thread-sleep! 10)
(rmt:send-receive cmd rid params (+ attemptnum 1)))
(rmt:send-receive cmd rid params (+ attemptnum 1)))
;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id))
(let* ((curr-max (rmt:get-max-query-average run-id))
(curr-max-val (cdr curr-max)))
(debug:print-info 4 "no server and read-only query, bypassing normal channel")
(if (> curr-max-val max-avg-qry)
(if (common:low-noise-print 10 "start server due to max average query too long")
(begin
(debug:print-info 0 "Max average query, " (inexact->exact (round curr-max-val)) "ms (" (car curr-max) ") exceeds " max-avg-qry "ms, try starting server ...")
(server:kind-run run-id))
(debug:print-info 3 "Max average query, " (inexact->exact (round curr-max-val)) "ms (" (car curr-max) ") below " max-avg-qry "ms, not starting server...")))
(rmt:open-qry-close-locally cmd run-id params)))))))
(rmt:open-qry-close-locally cmd run-id params)))))
(define (rmt:update-db-stats run-id rawcmd params duration)
(mutex-lock! *db-stats-mutex*)
(handle-exceptions
exn
(begin
(debug:print 0 "WARNING: stats collection failed in update-db-stats")
|
︙ | | |
Modified runs.scm
from [380bd479a2]
to [6ed325fc14].
︙ | | |
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
|
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
|
-
+
|
(all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
(all-test-names #f) ;; (hash-table-keys all-tests-registry))
(test-names #f) ;; (tests:filter-test-names all-test-names test-patts))
(required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
(task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
(tdbdat (tasks:open-db)))
(tasks:start-and-wait-for-server tdbdat run-id 10)
(if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
(set-signal-handler! signal/int
(lambda (signum)
(signal-mask! signum)
(print "Received signal " signum ", cleaning up before exit. Please wait...")
(let ((tdbdat (tasks:open-db)))
(tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed"))
|
︙ | | |
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
|
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
|
-
+
+
-
-
+
+
|
(item-path (item-list->path itemdat))
(tfullname (runs:make-full-test-name test-name item-path))
(newtal (append tal (list hed)))
(regfull (>= (length reg) reglen))
(num-running (rmt:get-count-tests-running-for-run-id run-id)))
;; every couple minutes verify the server is there for this run
(if (common:low-noise-print 60 "try start server" run-id)
(if (and (common:low-noise-print 60 "try start server" run-id)
(tasks:need-server run-id))
(tasks:start-and-wait-for-server tdbdat run-id 10))
(if (> num-running 0)
(if (> num-running 0)
(set! last-time-some-running (current-seconds)))
(if (> (current-seconds)(+ last-time-some-running 240))
(hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
;; (debug:print 0 "max-tries-hash: " (hash-table->alist *max-tries-hash*))
;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
|
︙ | | |
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
|
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
|
-
+
-
+
|
'()))
(lasttpath "/does/not/exist/I/hope"))
(debug:print-info 4 "runs:operate-on run=" run ", header=" header)
(if (not (null? tests))
(begin
(case action
((remove-runs)
(tasks:start-and-wait-for-server tdbdat run-id 10)
(if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
;; seek and kill in flight -runtests with % as testpatt here
(if (equal? testpatt "%")
(tasks:kill-runner (db:delay-if-busy tdbdat) target run-name)
(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
(debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
((set-state-status)
(tasks:start-and-wait-for-server tdbdat run-id 10)
(if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
(debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
((print-run)
(debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
action)
((run-wait)
(debug:print 1 "Waiting for run " runkey ", run=" runnamepatt " to complete"))
(else
|
︙ | | |
Modified tasks.scm
from [a0ed950b28]
to [7c5174d4f3].
︙ | | |
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
|
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
|
(let ((res #f))
(sqlite3:for-each-row
(lambda (id)
(set! res id))
mdb ;; NEEDS dbprep ADDED
"SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id)
res))
(define (tasks:need-server run-id)
(let ((forced (configf:lookup *configdat* "server" "required"))
(maxqry (cdr (rmt:get-max-query-average run-id)))
(threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
(cond
(forced
(if (common:low-noise-print 60 run-id "server required is set")
(debug:print-info 0 "Server required is set, starting server."))
#t)
((> maxqry threshold)
(if (common:low-noise-print 60 run-id "Max query time execeeded")
(debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server."))
#t)
(else
#f))))
;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
;; ensure a server is running for this run
(let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))
(delay-time 0))
(if (and (not server-dat)
(< delay-time delay-max-tries))
(begin
(if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)(debug:print 0 "Try starting server for run-id " run-id))
(if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
(debug:print 0 "Try starting server for run-id " run-id))
(server:kind-run run-id)
(thread-sleep! (min delay-time 5))
(loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))
(define (tasks:get-all-servers mdb)
(let ((res '()))
(sqlite3:for-each-row
|
︙ | | |
Modified tests/fullrun/megatest.config
from [6b64710311]
to [79e8e68f6b].
︙ | | |
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
-
+
|
# timeout 0.025
timeout 0.01
# Server is required - slower but more resistant to Sqlite issues.
# required yes
# Start server when average query takes longer than this
server-query-threshold 15
server-query-threshold 55500
# daemonize yes
# hostname #{scheme (get-host-name)}
## disks are:
## name host:/path/to/area
## -or-
|
︙ | | |