Megatest

mtest-repair-lib.scm at [2a906e05b0]
Login

File utils/wip/mtest-repair-lib.scm artifact c317aec679 part of check-in 2a906e05b0


(use general-lib)
(use typed-records)
(use regex-literals)
(use regex)
(use sql-de-lite)
(use posix)
(use files)
(use s11n)
(use ports)
(use z3)
(use base64)
(use matchable)




(define (cli-arg arg #!key (default #f) (is-list #f))
  (let* ((temp    (skim-cmdline-opts-withargs-by-regex arg)))
      (if (> (length temp) 0)
          (if is-list
              temp
              (car temp))
          default)))

(define (cli-switch arg)
  (let ((temp (skim-cmdline-opts-noarg-by-regex arg)))
    (if (> (length temp) 0)
        (car temp)
        #f)))


         
(defstruct nbjob
    pool
    jobid
    owner
    mtver
    user
    status
    cmdline
    execute)

(define (cmdline->execute cmdline)
  (let* ((match (string-match ".*-execute\\s+(\\S+)" cmdline)))
    (if match
        (with-input-from-string (z3:decode-buffer (base64-decode (cadr match))) read)
        #f)))


(define (nbjob-execute-ref nbjob key #!key (default #f))
  (let ((execute (nbjob-execute nbjob)))
    (if (list? execute)
        (let* ((match (alist-ref key execute)))
          (if match
              (if (list? match) (car match) match)
              default))
        default)))

(define (nbjob-process pool nbstatus-line)
  (let ((toks (string-split nbstatus-line ",")))
    (if (eq? 4 (length toks))
        (if (equal? (list-ref toks 1) "Jobid")
            #f
            (begin
              (let ((res
                     (make-nbjob
                      pool:    pool
                      status:  (list-ref toks 0)
                      jobid:   (list-ref toks 1)
                      user:    (list-ref toks 2)
                      cmdline: (list-ref toks 3)
                      execute: (cmdline->execute (list-ref toks 3))
                      )))
                res)))
        #f)))

(define (get-mtest-nb-jobs user nbpools #!key (cmdline-filter "megatest"))
  (let* ((res
          (apply append
                 (map (lambda (pool)
                        (let* (;;(user-filter ".*")
                               (user-filter user)
                               (cmd
                                (conc "nbstatus jobs --tar "pool" --fields status,jobid,user,cmdline --format csv "
                                      "'USER=~\""user-filter
                                      "\"&&cmdline=~\""cmdline-filter"\"'"))
                               (res (do-or-die cmd)))
                          (filter nbjob?
                                  (map (lambda (line)
                                         (nbjob-process pool line))
                                       (string-split res "\n")))))
                      nbpools))))
    res))

;;(define foo (get-mtest-nb-jobs "bjbarcla" '("pdx_normal" "pdx_critical")))

(define (cmdline->execute cmdline)
  (let* ((match (string-match ".*-execute\\s+(\\S+)" cmdline)))
    (if match
        (with-input-from-string (z3:decode-buffer (base64-decode (cadr match))) read)
        #f)))

;;;; kill jobs in netbatch for this area
(define (kill-mtest-jobs-in-netbatch)
  (let ((pwd (get-environment-variable "PWD"))
        (jobs (get-mtest-nb-jobs (get-environment-variable "USER") '("pdx_normal" "pdx_critical") )))

    (for-each
     (lambda (job)
       (let* ((jobid  (nbjob-jobid job))
              (pool   (nbjob-pool job))
              (status (nbjob-status job))
              (cmd (conc "nbjob --target "pool" remove "jobid)))
         ;;(print status)
         (print  cmd)
         (system cmd)))
                                        ;(pp (nbjob->alist job))
     (filter
      (lambda (job)
        (equal? (nbjob-execute-ref job 'toppath) pwd))
      jobs))
    (length jobs)

    ))


;;;; kill megatest jobs in running in netbatch
(define (kill-in-db #!key (megatest_exe "megatest"))
  (let* ((all-targ-patt (do-or-die "sqlite3 megatest.db \"select id from keys\" | tr \"\\n1234567890\" \"/%%%%%%%%%%\" | sed 's/\\/$//'"))
         )
    (for-each (lambda (state)
                (let* ((cmd (conc megatest_exe " -state "state" -set-state-status KILLED,n/a -testpatt % -target "all-targ-patt" -runname %")))
                  (print cmd)
                  (system cmd)))
              '("REMOTEHOSTSTART" "LAUNCHED" "RUNNING" "KEEP_TRYING" "PREQ_FAIL"))))



;;;; kill megatests and dashboards in this area running on this host
(define (kill-mtest-dboard)
  
  (let* ((this-toppath (pid->cwd (current-process-id)))
         (tmppath      (toppath->tmppath this-toppath))
         (config       (let ((res (conc this-toppath "/megatest.config")))
                         (when (not (file-exists? res))
                           (ierr "This is not a megatest run area; "res" does not exist.  Aborting.")
                           (exit 2))
                         res))
         (mtest-procs (get-my-mtest-procs))
         (dashboard-procs (get-my-dashboard-procs))
         (all-pids (map proc-PID (append mtest-procs dashboard-procs)))
         (our-pids (filter (lambda (pid)
                             ;;(print (pid-COMMAND pid))
                             (and
                              (equal? (pid->cwd pid) this-toppath)
                              
                              ))
                           all-pids)))

    (if (null? our-pids)
        (inote "No mtest or dboard processes on this host in in this runarea.")
        (begin
          (iwarn "Killing all megatest and dashboard processes on this host.")
          (gracefully-kill-pids our-pids)))
    ))



(define (db-mt-version dbpath)
  (let* ((cmd (conc "sqlite3 "dbpath" 'select val from metadat where var=\"MEGATEST_VERSION\"'"))
         (res (do-or-die cmd)))
    res))
   
;; TODO
(define (db-islocked? dbpath)
  (let-values (((ec so se) (isys (conc "sqlite3 "dbpath" vacuum"))))
    (let* ((message se)
           (is-locked (string-match "^.*database is locked.*$" message)))
      (inote "dbfile - "dbpath "; message - "message)
      is-locked)))

(define (db-unlock dbpath)
  (system (conc "/nfs/site/bjbarcla/bin/unlock_db.sh " dbpath))

  ;; (let* ((temp-dbpath (conc "/tmp/"(get-environment-variable "USER")"-"(current-process-id)".db")))
  ;;   (inote "Unlocking "dbpath)
  ;;   (do-or-die (conc "cp "dbpath" "temp-dbpath))
  ;;   (do-or-die (conc "rm -f "dbpath))
  ;;   (let* ((cmd (conc "sqlite3 "temp-dbpath" .dump | sqlite3 "dbpath)))
  ;;     (inote "Running: "cmd)
  ;;     (system cmd))
  ;;   ;;(do-or-die "sqlite3 "temp-dbpath" .dump | sqlite3 "dbpath)
  ;;   (if (db-islocked? dbpath)
  ;;       (begin
  ;;         (ierr "Could not unlock "dbpath)
  ;;         (exit 5))
  ;;       (inote "Unlocked "dbpath))
  ;;   #t)

  )


(define *user* (do-or-die "ls -ld . | awk '{print $3}'"))

(define (false-on-exception thunk)
  (handle-exceptions exn #f (thunk) ))

(define (safe-file-exists? path-string)
  (false-on-exception (lambda () (file-exists? path-string))))

(defstruct proc
    (USER "")
  (PID -1)
  (%CPU -1.0)
  (%MEM -1.0)
  (VSZ -1)
  (RSS -1)
  (TTY "")
  (STAT "")
  (START "")
  (TIME "")
  (COMMAND ""))

(define (toppath->tmppath toppath)
  (let* ((user *user*)
         (area (car (string-split
                     (car (reverse (string-split toppath "/")))
                     ".")))
         (dotified-path (string-substitute "/" "." toppath "all")))
    (conc "/tmp/" user "/megatest_localdb/" area "/" dotified-path)))


(define (delete-db dbfile)
  (let* ((db-files (glob (conc dbfile "*"))))
    (for-each
     (lambda (file)
       (inote "delete file " file)
       (if (delete-file* file)
           (inote "Removed file - " file)
           (iwarn "Could Not Remove file - " file))
       )
     db-files)))

(define (check-db dbfile)
  (let* ((has-wal (safe-file-exists? (conc dbfile "-wal")))
         (has-shm (safe-file-exists? (conc dbfile "-shm")))
         (has-journal (safe-file-exists? (conc dbfile "-journal")))
         (has-db (safe-file-exists? dbfile))
         (ok-flag #t))
    (when has-journal
      (iwarn "Journal exists - "(conc dbfile "-journal"))
      )
    (when has-wal
      (set! ok-flag #f)
      (iwarn "Wal-mode db exists: "(conc dbfile "-wal")))
    (if (not has-db)
        (begin
          (inote "db does not exists " dbfile)
          (set! ok-flag #f))
        (let* ((db-size (file-size dbfile)))
          (inote "db size = " db-size " -- " dbfile)
          (when (member db-size (list 0 1024))
            (iwarn "db has bad size - "db-size" -- "dbfile)
            (set! ok-flag #f))))
    ok-flag))
               

(define (linux-get-process-info-records)
  (let* ((raw (do-or-die "/bin/ps auwx"))
         (all-lines (string-split raw "\n"))
         (lines (cdr all-lines)) ;; skip title lines
         (re (regexp "^(\\S+)\\s+(\\d+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(.*)$")))
    (filter
     proc?
     (map
      (lambda (line)
        (let ((match (string-match re line)))
          (if match
              (make-proc
               USER:    (list-ref match 1)
               PID:     (string->number (list-ref match 2))
               %CPU:    (string->number (list-ref match 3))
               %MEM:    (string->number (list-ref match 4))
               VSZ:     (string->number (list-ref match 5))
               RSS:     (string->number (list-ref match 6))
               TTY:     (string->number (list-ref match 7))
               STAT:    (list-ref match 8)
               START:   (list-ref match 9)
               TIME:    (list-ref match 10)
               COMMAND: (list-ref match 11))
              #f)))
      lines))))
        
(define (get-my-mtest-server-procs)
  (let* ((procs (linux-get-process-info-records))
        (my-mtest-procs
         (filter
          (lambda (a-proc)
            (and
             (equal? *user* (proc-USER a-proc))
             (string-match "^.*/mtest\\s+.*-server.*" (proc-COMMAND a-proc))))
          procs)))
    my-mtest-procs))

(define (get-my-mtest-procs)
  (let* ((procs (linux-get-process-info-records))
        (my-mtest-procs
         (filter
          (lambda (a-proc)
            (and
             (equal? *user* (proc-USER a-proc))
             (string-match "^.*/m(ega)?test .*" (proc-COMMAND a-proc))
             (not (string-match "^.*/mtest-repair.*" (proc-COMMAND a-proc)))))
          procs)))
    my-mtest-procs))

(define (get-my-dashboard-procs)
  (let* ((procs (linux-get-process-info-records))
        (my-dboard-procs
         (filter
          (lambda (a-proc)
            (and
             (equal? *user* (proc-USER a-proc))
             (string-match "^.*/dboard.*" (proc-COMMAND a-proc))))
          procs)))
    my-dboard-procs))


(define (pid->environ-hash pid)
  (let* ((envfile (conc "/proc/"pid"/environ"))
         (ht (make-hash-table))
         (rawdata (with-input-from-file envfile read-string))
         (lines (string-split rawdata  (make-string 1 #\nul ))))
    (for-each
     (lambda (line)
       (let ((match (string-match "(^[^=]+)=(.*)" line)))
         (if match
             (hash-table-set! ht (list-ref match 1) (list-ref match 2)))))
     lines)
    ht))

(define (pid->cwd pid)
  (read-symbolic-link (conc "/proc/"pid"/cwd")))

(define (pid->mtest-monitor-db-file pid #!key (megatest_exe "megatest"))
  (let* ((env   (pid->environ-hash pid))
         (ltdir (hash-table-ref/default env "MT_LINKTREE" #f))
         (radir (hash-table-ref/default env "MT_RUN_AREA_HOME" #f))
         (cwd   (pid->cwd pid)))
    (let ((res
           (cond
            (ltdir (conc ltdir "/.db/monitor.db"))
            (radir (conc
                    (do-or-die
                     (conc megatest_exe " -start-dir "radir" -show-config -section setup -var linktree"))
                    "/.db/monitor.db"))
            (cwd  (conc
                   (do-or-die
                    (conc megatest_exe " -start-dir "cwd" -show-config -section setup -var linktree"))
                   "/.db/monitor.db"))
            
            (else #f))))
      res)))
      




(define (get-mdb-status mdb-file pid)
    ;; select state from servers where pid='4465';
  
  (cond
   ((not (string? mdb-file)) (conc "mdb-file could not be determined for pid " pid ">>"mdb-file ))
   ((not (safe-file-exists? mdb-file)) (conc "mdb-file does not exist for pid "pid" : "mdb-file))
   (else
    (let ((dbh (open-database mdb-file)))
      
      (set-busy-handler! dbh 10000)
      (let* ((sql-str "select state from servers where pid=?;")
             (stm (sql dbh sql-str))
             (alists (query fetch-alists stm (->string pid))))
        (if (null? alists)
            "server pid not in monitor.db"
            (cdr (car (car alists)))))))))

    
(define (mtest-server-pid->status pid)
  (let* ((mdb-file (pid->mtest-monitor-db-file pid)))
    (if mdb-file
        (get-mdb-status mdb-file pid)
        "no monitor.db file could be found"
        )))


(define (gracefully-kill-pids pids)
  (for-each (lambda (pid)
              (print "kill "pid)
              (system (conc "kill "pid)))
            pids)
  (sleep 5)
  (let* ((procs-left (linux-get-process-info-records))
         (pids-left (map proc-PID procs-left)))
    (for-each (lambda (pid)
                (when (member pid pids-left)
                  (print "kill -9"pid)
                  (system (conc "kill -9 "pid))))
              pids)))
  
            
              
(define (kill pid)
  (print "KILL "pid)
  (do-or-die (conc "kill -9 "pid)))

(define (reap-defunct-mtest-server-pid pid)
  (let ((status (mtest-server-pid->status pid)))
    (print pid"->"(mtest-server-pid->status pid))
    (if (member status (list "running" "dbprep" "available" "collision"))
        (print "pid="pid" in status "status" -- not killing")
        (kill pid))))
        
;; (let* ((procs (get-my-mtest-server-procs))
;;        (pids (map proc-PID procs))
;;        )
  
;;   (for-each reap-defunct-mtest-server-pid pids))



(define (mtdbver->mtrelver mtdbver)
  (let* ((table-alist '(
                        ("1.5402" . "1.54/02")
                        ("1.5406" . "1.54/05")
                        ("1.5408" . "1.54/07")
                        ("1.5409" . "1.54/09")
                        ("1.5412" . "1.54/12")
                        ("1.5413" . "1.54/13")
                        ("1.5414" . "1.54/14")
                        ("1.5415" . "1.54/15")
                        ("1.5416" . "1.54/16")
                        ("1.5417" . "1.54/17")
                        ("1.5418" . "1.54/18")
                        ("1.5419" . "1.54/19")
                        ("1.5421" . "1.54/21")
                        ("1.5411" . "1.54/support-for-skip")
                        ("1.5522" . "1.55/22")
                        ("1.5523" . "1.55/23")
                        ("1.5524" . "1.55/24")
                        ("1.5525" . "1.55/25")
                        ("1.6001" . "1.60/01")
                        ("1.6002" . "1.60/02")
                        ("1.6003" . "1.60/03")
                        ("1.6004" . "1.60/04")
                        ("1.6005" . "1.60/05")
                        ("1.6006" . "1.60/06")
                        ("1.6007" . "1.60/07")
                        ("1.6008" . "1.60/08")
                        ("1.6009" . "1.60/09")
                        ("1.6009" . "1.60/11")
                        ("1.6012" . "1.60/12")
                        ("1.6013" . "1.60/13")
                        ("1.6014" . "1.60/14")
                        ("1.6015" . "1.60/15")
                        ("1.6016" . "1.60/16")
                        ("1.6017" . "1.60/17")
                        ("1.6018" . "1.60/18")
                        ("1.6019" . "1.60/19")
                        ("1.6021" . "1.60/21")
                        ("1.6022" . "1.60/22")
                        ("1.6023" . "1.60/23")
                        ("1.6024" . "1.60/24")
                        ("1.6025" . "1.60/25")
                        ("1.6026" . "1.60/26")
                        ("1.6027" . "1.60/27")
                        ("1.6028" . "1.60/28")
                        ;;("1.6029" . "1.60/29")
                        ("1.6029" . "1.60/29a")
                        ("1.6031" . "1.60/31")
                        ("1.6101" . "1.61/01")
                        ("1.6101" . "1.61/01a")
                        ("1.6102-c2ba" . "1.61/02")
                        ("1.6103-3e88" . "1.61/03")
                        ("1.6104-ee53" . "1.61/04")
                        ("1.6105-232b" . "1.61/05")
                        ("1.6201-e652" . "1.62/01")
                        ("1.6204-c74d" . "1.62/04")
                        ("1.6205-aff0" . "1.62/05")
                        ("1.6207-6f59" . "1.62/07")
                        ("1.6301-fbf0" . "1.63/01")
                        ("1.6302-da4a" . "1.63/02")
                        ("1.6303-aa5f" . "1.63/03")
                        ("1.6304-fa49" . "1.63/04")
                        ("1.6305-a03b" . "1.63/05")
                        ("1.6306-7a12" . "1.63/06")
                        ("1.6307-fb5d" . "1.63/07")
                        ("1.6308-35e0" . "1.63/08")
                        ("1.6309-738c" . "1.63/09")
                        ("1.6309-880c" . "1.63/09a")
                        ("1.6309-b566" . "1.63/09b")
                        ("1.6311-fb43" . "1.63/11")
                        ("1.6311-fb43" . "1.63/11b")
                        ("1.6311-8a6c" . "1.63/11b")
                        ("1.6402-03c5" . "1.64/02")
                        )
                      )
         (res (alist-ref mtdbver table-alist equal?)))
    res))