babymaker  e95a6a9342d4604277fe7cc6149b6b5b24447d89
resubmit.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 import os, sys, re
3 import glob
4 import string
5 from pprint import pprint
6 import ROOT
7 import argparse
8 
# Command-line interface: the submission to inspect is identified either by
# its timestamp (-t) or by a path whose last component is that timestamp (-p).
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--logpath")
parser.add_argument("-t", "--timestamp")
args = parser.parse_args()

# Without either option there is nothing to look up.
if not (args.timestamp or args.logpath):
    sys.exit("Please provide a timestamp either as, e.g. 151019_011440, or a path ending with the timestamp")

# An explicit timestamp wins; otherwise take the last component of the log
# path, ignoring any trailing slashes.
timestamp = args.timestamp or args.logpath.rstrip("/").split("/")[-1]
20 
# Default resubmission mode; the interactive prompt further down may switch
# to one input file per job.
onePerJob = False

# Determine if the job is running at UCSB or UCSD, based on the host name.
# HOSTNAME may be unset in the environment; fall back to an empty string so
# the substring tests below cannot raise TypeError on None.
host = os.environ.get("HOSTNAME") or ""
atUCSB = "compute" in host or "physics.ucsb.edu" in host

# Redirector substituted into the job scripts when jobs are resubmitted.
# Alternative: "root://cmsxrootd.fnal.gov//"
redirector = "root://cms-xrd-global.cern.ch//"
31 
# All paths below are built relative to the current directory, whose name
# must contain 'babymaker'.
bdir = os.getcwd()
if 'babymaker' not in os.path.basename(bdir):
    sys.exit("Execute from babymaker directory")

# Logs of the submission being checked: <bdir>/logs/<timestamp>.
logdir = os.path.join(bdir, 'logs', timestamp)
if not os.path.exists(logdir):
    sys.exit("Can't find log directory %s" %logdir)

# Logs of failed attempts are moved into an 'arxiv' subdirectory, created on
# first use.
arxivdir = os.path.join(logdir, 'arxiv')
if not os.path.exists(arxivdir):
    os.mkdir(arxivdir)
43 
# Classify every job of the submission by inspecting its condor .log file and
# the job's .out / .err files, which share the same base name.
loglist = glob.glob(logdir + "/*.log")
print("Found %i logs" % len(loglist))

failed = set()      # base names of jobs to be resubmitted
unfinished = set()  # base names of jobs whose .out file is still empty
xrootd_err = set()  # base names of jobs whose .err shows an xrootd auth failure

for flog in loglist:
    # Drop the ".log" extension with splitext: str.rstrip(".log") would strip
    # any trailing run of the characters '.', 'l', 'o', 'g' and so truncate
    # job names that happen to end in one of those characters.
    stem = os.path.splitext(flog)[0]
    ferr = stem + ".err"
    fout = stem + ".out"
    bname = os.path.basename(stem)

    with open(flog) as handle:
        logfile = handle.read()
    if "Job was aborted by the user" in logfile:
        failed.add(bname)
        continue

    if os.path.getsize(fout) == 0:
        # Nothing written to stdout yet: counted as unfinished, not failed.
        unfinished.add(bname)
    else:
        # NOTE(review): the stderr checks are taken to apply only to jobs
        # with non-empty stdout -- confirm against the repository version.
        with open(fout) as handle:
            outfile = handle.read()
        if "BABYMAKER: Written" not in outfile:
            failed.add(bname)

        with open(ferr) as handle:
            errfile = handle.read()
        # transfer not necessary at UCSB
        if "Transfer took" not in errfile and not atUCSB:
            failed.add(bname)
        if "cmsRun exit code 1" in errfile:
            failed.add(bname)
        if "Fatal Exception" in errfile:
            failed.add(bname)
        if "Socket error while handshaking: [FATAL] Auth failed" in errfile:
            xrootd_err.add(bname)

    # An xrootd authentication error does not by itself mark the job failed;
    # report such jobs so they can be checked by hand.
    if bname in xrootd_err and bname not in failed:
        print("%s %s" % ("xrootd err but success(?): ", bname))
79 
# Summary of the scan. When no job failed the script stops here, with a
# message that depends on whether jobs are still unfinished.
if unfinished:
    print("--------- Unfinished:")
    pprint(unfinished)
    print("%s %d %s" % ("--------- Total unfinished ", len(unfinished), "\n"))

if not failed:
    if not unfinished:
        sys.exit("\nCongrats, no jobs failed. You might be able to go out and enjoy the mountains now :o)\n")
    sys.exit("\nCongrats, no jobs failed, but still "+str(len(unfinished))+" jobs to go\n")

# At least one job failed: list the failures and print the totals.
print("--------- Failed:")
pprint(failed)
print("%s %d" % ("Total unfinished ", len(unfinished)))
print("%s %d" % ("Total failed ", len(failed)))
print("%s %d" % ("Total with xrootd err", len(xrootd_err)))
93 
# Ask for confirmation; any answer other than a literal 'y' ends the script.
user_input = raw_input('Resubmit jobs [y/N]?')
if user_input != 'y':
    sys.exit("Bye.")

# Optionally resubmit with one input file per job; only a literal 'y'
# enables it.
user_input = raw_input('Resubmit with one file per job [y/N]?')
if user_input == 'y':
    onePerJob = True
101 
# --- resubmission
# For every failed job: archive its logs, then either resubmit the same job
# with its inputs redirected, or (onePerJob) submit one new job per input file.

# Job scripts (.sh) and condor submit files (.cmd) are looked up under the
# path obtained by replacing /logs/ with /run/ in the log directory.
rundir = logdir.replace("/logs/", "/run/")

submit_prefix = "condor_submit "
if atUCSB:
    submit_prefix = "ssh cms25.physics.ucsb.edu condor_submit "

total_jobs = 0
for old_baby in failed:
    fexe = os.path.join(rundir, old_baby + ".sh")
    fcmd = os.path.join(rundir, old_baby + ".cmd")

    # Check the submission files BEFORE archiving the logs: once the logs are
    # moved, the job is no longer found by the log scan, so aborting after
    # the move would silently drop it from any later resubmission.
    if (not os.path.exists(fexe)) or (not os.path.exists(fcmd)):
        sys.exit("Cannot find either .sh or .cmd for: %s" % os.path.join(rundir, old_baby))

    # Move the logs of the failed attempt out of the way.
    for ext in (".log", ".err", ".out"):
        os.rename(logdir + "/" + old_baby + ext, arxivdir + "/" + old_baby + ext)

    if not onePerJob:
        # Same job again, with inputs read through the redirector instead of
        # the local hadoop path. The script is rewritten in place.
        with open(fexe) as handle:
            old_exe = handle.read()
        new_exe = old_exe.replace("file:/hadoop/cms/phedex", redirector)
        with open(fexe, 'w') as handle:
            handle.write(new_exe)
        print("INFO: Submitting %s" % fcmd)
        os.system(submit_prefix + fcmd)
        total_jobs = total_jobs + 1
        continue

    # --- one job per input file
    # --- read the old submission files
    with open(fexe) as handle:
        old_exe = handle.readlines()
    with open(fcmd) as handle:
        old_cmd = handle.read()

    # --- parse for input files: lines mentioning /store/ that are not lcg-cp
    # commands, each reduced to its "/store/....root" path.
    inputfiles = [
        "/store/" + line.split("/store/").pop().split(".root")[0] + ".root"
        for line in old_exe
        if "/store/" in line and "lcg-cp" not in line
    ]

    # The job script with all of its input-file lines removed.
    exe_template = [line for line in old_exe if ("/store/" not in line or "lcg-cp" in line)]

    # --- assume none are local in case job failed because file was not found
    for i, infile in enumerate(inputfiles):
        new_baby = old_baby + "_rs" + str(i)

        # Put the single input file right after the 'inputFiles=\' line.
        new_exe_lines = list(exe_template)
        ind = new_exe_lines.index('inputFiles=\\\n')
        new_exe_lines.insert(ind + 1, redirector + infile + ' \\\n')
        new_exe = ''.join(new_exe_lines).replace(old_baby, new_baby)
        with open(fexe.replace(old_baby, new_baby), 'w') as f1:
            f1.write(new_exe)

        # The submit file is the old one with the job name replaced.
        new_cmd = old_cmd.replace(old_baby, new_baby)
        fnew_cmd = fcmd.replace(old_baby, new_baby)
        with open(fnew_cmd, 'w') as f2:
            f2.write(new_cmd)

        print("INFO: Submitting %s" % fnew_cmd)
        os.system(submit_prefix + fnew_cmd)
        total_jobs = total_jobs + 1

print("Submitted %i jobs." % total_jobs)
155 
# --- check if output files already exist
# For each failed job, read the output location from the "SFN=" lines of its
# job script and offer to delete an existing output file.
for old_baby in failed:
    fexe = os.path.join(logdir.replace("/logs/","/run/"), old_baby+".sh")
    with open(fexe) as script:
        old_exe = script.readlines()

    for line in old_exe:
        if "SFN=" not in line:
            continue

        destination = line.split("SFN=").pop()
        if "T1tttt" in old_baby:
            # strip() removes any '$', 'i' and newline characters from both
            # ends; the file name is then rebuilt from the job name.
            outputfile = destination.strip("$i\n") + old_baby + ".root"
        else:
            outputfile = destination.strip("\n")

        if not os.path.exists(outputfile):
            continue

        # Default answer is yes: anything but a literal 'n' removes the file.
        user_input = raw_input('Remove output file %s corresponding to a failed job [Y/n]?' % outputfile)
        if user_input != 'n':
            print("%s %s" % ("Removing file: ", outputfile))
            os.remove(outputfile)
171 
172