5 from pprint
import pprint
# Command-line handling: two optional flags, -p/--logpath and -t/--timestamp.
9 parser = argparse.ArgumentParser()
10 parser.add_argument(
"-p",
"--logpath")
11 parser.add_argument(
"-t",
"--timestamp")
12 args = parser.parse_args()
# The timestamp is taken either verbatim from --timestamp ...
# NOTE(review): the conditions selecting between the three statements below
# are not visible in this excerpt; presumably an if/elif/else on which flag
# was given -- confirm against the full file.
15 timestamp = args.timestamp
# ... or from the last path component of --logpath (trailing "/" removed first).
17 timestamp = args.logpath.rstrip(
"/").split(
"/")[-1]
# Abort with a usage hint (sys.exit with a string argument).
19 sys.exit(
"Please provide a timestamp either as, e.g. 151019_011440, or a path ending with the timestamp")
# Host detection and directory setup.
# NOTE(review): os.environ.get returns None when HOSTNAME is unset, in which
# case the `in host` tests below would raise TypeError -- confirm whether a
# guard exists in the lines not shown here.
24 host = os.environ.get(
"HOSTNAME")
# Hosts whose name contains "compute" or "physics.ucsb.edu" get special
# treatment; the redirector assignment below presumably belongs to that
# branch (the intervening lines are not visible) -- TODO confirm.
# The statement that follows the redirector assignment aborts unless the last
# path component of `bdir` contains 'babymaker'. `bdir` is defined outside
# this excerpt.
26 if "compute" in host
or "physics.ucsb.edu" in host:
30 redirector =
"root://cms-xrd-global.cern.ch//" 33 if (
'babymaker' not in bdir.split(
"/").pop()):
34 sys.exit(
"Execute from babymaker directory")
# Logs for this submission live in <bdir>/logs/<timestamp>; abort if absent.
36 logdir = os.path.join(bdir,
'logs',timestamp)
37 if not os.path.exists(logdir):
38 sys.exit(
"Can't find log directory %s" %logdir)
# Archive sub-directory <bdir>/logs/<timestamp>/arxiv.
# NOTE(review): the body of the existence check is not visible; presumably it
# creates the directory -- confirm.
40 arxivdir = os.path.join(bdir,
'logs',timestamp,
'arxiv')
41 if not os.path.exists(arxivdir):
# Collect every *.log file directly inside logdir and report how many were found
# (Python 2 print statement).
45 loglist = [x
for x
in glob.glob(logdir+
"/*.log")]
46 print "Found %i logs" %(len(loglist))
# Derive the sibling .err/.out paths and the bare job name from the .log path.
#
# Fix: str.rstrip(".log") does not remove the ".log" suffix; it removes any
# trailing run of the characters '.', 'l', 'o' and 'g'.  A job whose name ends
# in one of those letters was therefore truncated (e.g. "prolog.log" became
# "pr"), so the wrong .err/.out files were opened and the wrong job name was
# recorded.  Slice off the exact suffix instead.
log_suffix = ".log"
log_stem = flog[:-len(log_suffix)] if flog.endswith(log_suffix) else flog
ferr = log_stem + ".err"
fout = log_stem + ".out"
# Job name = file name without directory and without the ".log" suffix.
bname = log_stem.split("/").pop()
# Classify one job by scanning its .log, .out and .err files for known markers.
# NOTE(review): the action taken in each branch (which of the `failed`,
# `unfinished` and `xrootd_err` collections is updated) is not visible in this
# excerpt, so only the tested conditions are described here.
# NOTE(review): the files are read via open(...).read() without an explicit
# close.
56 logfile = open(flog).read()
# Marker written to the .log when the user removed the job.
57 if "Job was aborted by the user" in logfile:
# An empty .out file.
60 if os.path.getsize(fout)==0:
# The .out file lacks the "BABYMAKER: Written" marker.
64 if "BABYMAKER: Written" not in open(fout).read():
66 errfile = open(ferr).read()
# A missing "Transfer took" marker is only considered when not at UCSB.
68 if "Transfer took" not in errfile
and not atUCSB:
70 if "cmsRun exit code 1" in errfile:
72 if "Fatal Exception" in errfile:
# xrootd authentication failure reported in the .err file.
74 if "Socket error while handshaking: [FATAL] Auth failed" in errfile:
# Report jobs that hit an xrootd error but are not in `failed`.
77 if bname
in xrootd_err
and bname
not in failed:
78 print "xrootd err but success(?): ",bname
# Summary report (Python 2 print statements).
# NOTE(review): the loops that list the individual unfinished/failed jobs and
# the condition guarding the two "Congrats" exits are not visible in this
# excerpt -- confirm against the full file.
80 if len(unfinished) > 0 :
81 print "--------- Unfinished:" 83 print "--------- Total unfinished ",len(unfinished),
"\n" 85 print "--------- Failed:" 87 print "Total unfinished ",len(unfinished)
88 print "Total failed ",len(failed)
89 print "Total with xrootd err",len(xrootd_err)
# Both exits below report that no job failed; which message is used depends on
# whether any job is still unfinished.
91 if len(unfinished) == 0 : sys.exit(
"\nCongrats, no jobs failed. You might be able to go out and enjoy the mountains now :o)\n")
92 else : sys.exit(
"\nCongrats, no jobs failed, but still "+str(len(unfinished))+
" jobs to go\n")
# Interactive prompts (Python 2 raw_input). How the answers are interpreted is
# not visible in this excerpt.
94 user_input = raw_input(
'Resubmit jobs [y/N]?')
98 user_input = raw_input(
'Resubmit with one file per job [y/N]?')
# Resubmission: iterate over every job name recorded in `failed`.
104 for old_baby
in failed:
# The job's executable (.sh) and condor description (.cmd) live in the run
# directory, obtained by replacing "/logs/" with "/run/" in logdir.
105 fexe = os.path.join(logdir.replace(
"/logs/",
"/run/"), old_baby+
".sh")
106 fcmd = os.path.join(logdir.replace(
"/logs/",
"/run/"), old_baby+
".cmd")
# Move the previous attempt's .log/.err/.out files into the arxiv directory.
107 os.rename(logdir+
"/"+old_baby+
".log", arxivdir+
"/"+old_baby+
".log")
108 os.rename(logdir+
"/"+old_baby+
".err", arxivdir+
"/"+old_baby+
".err")
109 os.rename(logdir+
"/"+old_baby+
".out", arxivdir+
"/"+old_baby+
".out")
# Rewrite the executable in place, replacing the "file:/hadoop/cms/phedex"
# prefix with the xrootd redirector.
111 old_exe = open(fexe).read()
112 new_exe = old_exe.replace(
"file:/hadoop/cms/phedex", redirector)
113 with open(fexe,
'w')
as f: f.write(new_exe)
# Build the submit command; at UCSB it is wrapped in ssh to
# cms25.physics.ucsb.edu.
# NOTE(review): the statement that actually runs sys_cmd is not visible in
# this excerpt.
114 sys_cmd =
"condor_submit " + fcmd
115 if atUCSB: sys_cmd =
"ssh cms25.physics.ucsb.edu condor_submit " + fcmd
116 print "INFO: Submitting", fcmd
# Count the job; total_jobs is initialised outside this excerpt.
118 total_jobs = total_jobs + 1
# One-input-file-per-job resubmission of `old_baby`.
# NOTE(review): the enclosing loop/branch structure is not visible in this
# excerpt.
# The job name is split on "_"; the last tag yields the batch number and the
# second-to-last the number of files. str.strip removes the given *characters*
# from both ends (not a literal prefix) before the int() conversion.
120 tags = old_baby.split(
"_")
121 batch = int(tags[-1].strip(
"batch"))
122 nfiles = int(tags[-2].strip(
"mf"))
# Both the .sh and the .cmd of the original job must exist.
125 if (
not os.path.exists(fexe))
or (
not os.path.exists(fcmd)):
126 sys.exit(
"Cannot find either .sh or .cmd for: %s" % logdir)
127 old_exe = open(fexe).readlines()
128 old_cmd = open(fcmd).read()
# Input-file lines are those mentioning "/store/" but not "lcg-cp"; each is
# then reduced to the bare "/store/....root" path. The statement fused onto
# the same line afterwards starts the loop over those input files.
131 inputfiles = [line
for line
in old_exe
if (
"/store/" in line
and "lcg-cp" not in line)]
132 for i, line
in enumerate(inputfiles):
133 inputfiles[i] =
"/store/"+line.split(
"/store/").pop().split(
".root")[0]+
".root" 137 for i, infile
in enumerate(inputfiles):
# New job name: original name plus "_rs<index>".
138 new_baby = old_baby +
"_rs"+str(i)
# Keep every line of the old executable except the input-file lines, then
# insert the single input file (prefixed by the redirector) right after the
# 'inputFiles=\' line. list.index raises ValueError if that line is absent.
139 new_exe_lines = [line
for line
in old_exe
if (
"/store/" not in line
or "lcg-cp" in line)]
140 ind = new_exe_lines.index(
'inputFiles=\\\n')
141 new_exe_lines.insert(ind+1, redirector + infile +
' \\\n')
142 new_exe =
''.join(new_exe_lines)
# Rename the job everywhere inside the script and write it under the new name.
143 new_exe = new_exe.replace(old_baby, new_baby)
144 with open(fexe.replace(old_baby,new_baby),
'w')
as f1: f1.write(new_exe)
# Same renaming for the condor .cmd file.
145 new_cmd = old_cmd.replace(old_baby, new_baby)
146 fnew_cmd = fcmd.replace(old_baby,new_baby)
147 with open(fnew_cmd,
'w')
as f2: f2.write(new_cmd)
# Build the submit command; at UCSB it is wrapped in ssh to
# cms25.physics.ucsb.edu.
# NOTE(review): the statement that actually runs sys_cmd is not visible in
# this excerpt.
148 sys_cmd =
"condor_submit " + fnew_cmd
149 if atUCSB: sys_cmd =
"ssh cms25.physics.ucsb.edu condor_submit " + fnew_cmd
150 print "INFO: Submitting", fnew_cmd
152 total_jobs = total_jobs + 1
# Final count of jobs handled by the resubmission code.
154 print(
"Submitted %i jobs." % total_jobs)
# Clean-up: for each failed job, locate its output file and offer to delete it.
157 for old_baby
in failed:
158 fexe = os.path.join(logdir.replace(
"/logs/",
"/run/"), old_baby+
".sh")
159 old_exe = open(fexe).readlines()
# The output path is taken from the text following "SFN=" on `line`.
# NOTE(review): the loop/condition that selects `line` is not visible in this
# excerpt.
# For job names containing "T1tttt" the characters '$', 'i' and newline are
# stripped from both ends and "<old_baby>.root" is appended; the other
# assignment (fused onto the same line as ".root") only strips newlines.
162 if (
"T1tttt" in old_baby):
163 outputfile = line.split(
"SFN=").pop().strip(
"$i\n") + old_baby +
".root" 165 outputfile = line.split(
"SFN=").pop().strip(
"\n")
# Ask before deleting; any answer other than exactly 'n' removes the file.
166 if os.path.exists(outputfile):
167 user_input = raw_input(
'Remove output file %s corresponding to a failed job [Y/n]?' % outputfile)
168 if (user_input!=
'n'):
169 print "Removing file: ", outputfile
170 os.remove(outputfile)