babymaker  e95a6a9342d4604277fe7cc6149b6b5b24447d89
get_flist.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
import argparse
import glob
import json
import os
import pprint
import socket
import string
import sys

import ROOT
import das_client as das
10 
def split_csv(text):
    """Split a comma-separated option value into its non-empty items.

    Whitespace around every item is trimmed and empty items are dropped, so
    a trailing comma or a value such as ``"a, b"`` does not produce bogus
    entries.

    Args:
        text: the raw option string, e.g. ``"/A/B/C,/D/E/F"``.

    Returns:
        A list of the cleaned items, in their original order.
    """
    return [item.strip() for item in text.split(",") if item.strip()]


def datasets_from_lines(lines):
    """Pick the dataset names out of the lines of a dataset-list file.

    A line counts as a dataset name only if its first character is "/";
    everything else (blank lines, comments, ...) is ignored. Trailing
    whitespace is removed from the names that are kept.

    Args:
        lines: iterable of lines without their newline terminators.

    Returns:
        A list of dataset names, in file order.
    """
    return [line.rstrip() for line in lines if line.startswith("/")]


def collect_datasets(datasets_opt, files_opt):
    """Build the list of datasets to process from the two CLI options.

    ``--datasets`` takes precedence: the list files named by ``--files`` are
    read only when no ``--datasets`` value was given.

    Args:
        datasets_opt: value of ``--datasets`` (comma-separated names) or None.
        files_opt: value of ``--files`` (comma-separated paths of text files
            holding one dataset name per line) or None.

    Returns:
        A list of dataset names; empty when neither option was supplied.
    """
    if datasets_opt:
        return split_csv(datasets_opt)

    found = []
    for list_path in split_csv(files_opt or ""):
        with open(list_path) as list_file:
            found.extend(datasets_from_lines(list_file.read().splitlines()))
    return found


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--datasets",
                        help="comma-separated list of dataset names")
    parser.add_argument("-f", "--files",
                        help="comma-separated list of text files with one "
                             "dataset name per line (lines not starting "
                             "with '/' are ignored)")
    args = parser.parse_args()

    datasets = collect_datasets(args.datasets, args.files)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Processing datasets:")
    pprint.pprint(datasets)
26 
# Where the files can be found depends on whether we run on UCSB or UCSD
def classify_host(hostname):
    """Map a machine hostname onto the short site tag used by this script.

    The checks run in a fixed order (ucsd, compute, ucsb, lxplus), so a name
    matching several patterns gets the first matching tag.

    Args:
        hostname: the machine's hostname; None or "" is treated as unknown.

    Returns:
        "sd" for UCSD machines, "sb" for the UCSB compute-0-X nodes, or
        "lxplus".

    Raises:
        SystemExit: for UCSB machines other than the compute nodes and for
            any hostname that matches none of the known patterns.
    """
    # A missing hostname must end in the "Unknown host" message rather than
    # in a TypeError from `"ucsd" in None`.
    hostname = hostname or ""
    if "ucsd" in hostname:
        return "sd"
    if "compute" in hostname:
        return "sb"
    if "ucsb" in hostname:
        sys.exit("\033[91mERROR: To allow access to hadoop at UCSB use one of the compute-0-X machines. \033[0m")
    if "lxplus" in hostname:
        return "lxplus"
    sys.exit("\033[91mERROR: Unknown host: "+hostname+" Exit. \033[0m")


def hadoop_prefix(site):
    """Return the directory prefix that is prepended to DAS file names.

    Args:
        site: a site tag as returned by classify_host().

    Returns:
        '/hadoop/cms/phedex' for "sd", '/mnt/hadoop/cms' for every other tag.
    """
    if site == "sd":
        return '/hadoop/cms/phedex'
    return '/mnt/hadoop/cms'


if __name__ == "__main__":
    # HOSTNAME is a shell variable that is not always exported to child
    # processes, so fall back to asking the OS for the hostname.
    host = classify_host(os.environ.get("HOSTNAME") or socket.gethostname())
    hadoop = hadoop_prefix(host)
# Directory in which the per-dataset file lists (flist_*.txt) are written
def flist_directory(cmssw_base):
    """Return the path of the flists directory inside a CMSSW area.

    Args:
        cmssw_base: value of the CMSSW_BASE environment variable, or None
            when it is not set.

    Returns:
        ``<cmssw_base>/src/flists/``.

    Raises:
        SystemExit: if cmssw_base is None or empty, so that a missing
            environment produces a readable message instead of a traceback
            from os.path.join.
    """
    if not cmssw_base:
        sys.exit("ERROR: CMSSW_BASE is not set.")
    return os.path.join(cmssw_base, "src/flists/")


if __name__ == "__main__":
    flistdir = flist_directory(os.getenv("CMSSW_BASE"))
    if not os.path.exists(flistdir):
        sys.exit("ERROR: flists repository not found.")
42 
43 
def parse_dataset_name(ds):
    """Split a dataset name of the form /primary/campaign-reco/tier.

    Args:
        ds: the full dataset name.

    Returns:
        A tuple ``(dsname, campaign, reco)`` where ``campaign`` is the part of
        the second field before its first "-" and ``reco`` is the remainder
        of that field after the "-" (possibly empty).

    Raises:
        ValueError: if the name has fewer than three "/"-separated fields
            after the leading slash.
    """
    tags = ds.split('/')
    if len(tags) < 4:
        raise ValueError("malformed dataset name '" + ds + "'")
    dsname = tags[1]
    campaign = tags[2].split('-')[0]
    reco = tags[2][len(campaign)+1:]
    return dsname, campaign, reco


def runs_for_file(ds, file_name, query_runs):
    """Return the run list string to record next to a file, or ''.

    Run information is only recorded for datasets whose name contains
    "Run2015". For prompt reco the run number is parsed from the file path;
    otherwise it is obtained from ``query_runs``.

    Args:
        ds: the full dataset name.
        file_name: the logical file name as reported by DAS.
        query_runs: callable taking a file name and returning an iterable of
            run numbers; only called for non-PromptReco "Run2015" datasets.
            This is a per-file query, so it is slow.

    Returns:
        A string of run numbers ('' when not applicable); multiple runs are
        joined with ",".
    """
    if "Run2015" not in ds:
        return ''
    if "PromptReco" in ds:
        return file_name.split("/000/").pop().split("/00000/")[0].replace("/","")
    return ','.join([str(irun) for irun in query_runs(file_name)])


def format_entry(source, nevents, name, runlist=''):
    """Format one line of the file list.

    Args:
        source: access method tag, "local" or "xrootd".
        nevents: number of events in the file.
        name: the file name.
        runlist: optional run list; appended after a space when non-empty.

    Returns:
        The newline-terminated entry, with nevents left-aligned in a
        10-character column.
    """
    entry = source + " " + '{:<10}'.format(nevents) + name
    if runlist != '':
        entry += " " + runlist
    return entry + '\n'


def stat_line(label, part, total):
    """Format a "stat <label> available locally" summary line.

    Args:
        label: what is being counted, e.g. "events" or "files".
        part: the locally available count.
        total: the overall count.

    Returns:
        The newline-terminated line ``"stat <label> available locally:
        <pct>% <part>/<total>"``. When total is 0 the percentage is reported
        as 0 instead of dividing by zero.
    """
    percent = float(part)/float(total)*100. if total else 0.
    return ("stat " + label + " available locally: "
            + "%s%% %i/%i" % ('{:.0f}'.format(percent), part, total) + '\n')


def write_flist(out, ds, file_infos, hadoop_root, query_runs):
    """Write the file list of one dataset: one entry per file plus a summary.

    A file is tagged "local" only if ``hadoop_root + name`` exists and has
    exactly the size reported in its dictionary; every other file is tagged
    "xrootd".

    Args:
        out: writable text stream receiving the list.
        ds: the full dataset name.
        file_infos: list of dictionaries with keys 'name', 'size' and
            'nevents', one per file.
        hadoop_root: directory prefix prepended to each file name to obtain
            the local path.
        query_runs: callable passed on to runs_for_file().

    Returns:
        A tuple ``(nent_local, nent, nfiles_local)``: events available
        locally, total events, and number of files available locally.
    """
    nent_local = 0
    nent = 0
    nfiles_local = 0
    for info in file_infos:
        name = info['name']
        nevents = info['nevents']
        runlist = runs_for_file(ds, name, query_runs)

        # check if file is available locally
        source = "xrootd"
        local_path = hadoop_root + name
        if os.path.exists(local_path):
            if os.path.getsize(local_path) == info['size']:
                source = "local"
                nfiles_local += 1
                nent_local += nevents
            else:
                print("Encountered partial local file, will use xrootd instead")

        out.write(format_entry(source, nevents, name, runlist))
        nent += nevents

    out.write("nEventsTotal: " + '{:<10}'.format(nent) + '\n')
    out.write(stat_line("events", nent_local, nent))
    out.write(stat_line("files", nfiles_local, len(file_infos)))
    return nent_local, nent, nfiles_local


if __name__ == "__main__":
    for ds in datasets:
        # parse the dataset name to build the name of the output file list
        try:
            dsname, campaign, reco = parse_dataset_name(ds)
        except ValueError as err:
            print("\033[93m WARNING: " + str(err) + " Skip dataset. \033[0m")
            continue

        # query DAS; the result is a list of dictionaries; one dict per file
        # each dictionary has the name, size and nevents of a file
        # this is a per-dataset level query, so it's quick
        print("INFO: Query DAS for files in: " + '_'.join([dsname, campaign, reco]))
        this_fdicts = das.getFilesInfo(ds, wanted_keys=['name', 'size', 'nevents'])
        if not this_fdicts:
            print("\033[93m WARNING: " + ds + " not found! Skip dataset. \033[0m")
            continue

        fnm = flistdir + '/' + '_'.join(['flist', dsname, campaign, reco + '.txt'])
        # Only newly created lists get their mode changed; lists that already
        # exist keep whatever permissions they have.
        do_chmod = not os.path.exists(fnm)

        # `with` guarantees the list is closed even if a DAS query fails midway.
        with open(fnm, "w") as f:
            write_flist(f, ds, this_fdicts, hadoop, das.getFileRunInfo)

        if do_chmod:
            # 0o777 is the octal spelling accepted by both Python 2.6+ and 3.
            os.chmod(fnm, 0o777)