babymaker  e95a6a9342d4604277fe7cc6149b6b5b24447d89
das_client.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 import os, sys, time, re
3 import json
4 import urllib, urllib2, httplib, cookielib
5 import pprint
6 from optparse import OptionParser
7 from types import GeneratorType
8 
# Data retrieval function taken/adapted from the DAS command line tool: https://cmsweb.cern.ch/das/cli
DAS_CLIENT = 'das-client/1.1::python/%s.%s' % sys.version_info[:2]
def get_data(query):
    """Contact DAS server and retrieve data for given DAS query.

    The query is sent as a GET request to https://cmsweb.cern.ch/das/cache
    with ``idx=0`` and ``limit=0``. When the server replies with a bare
    32-character token matching ``[a-z0-9]{32}`` instead of a JSON document,
    the token is treated as a request id (pid) and the same URL is polled
    with that pid until a different kind of reply arrives.

    Args:
        query: DAS query string, e.g. ``'dataset=/A/B/C'``.

    Returns:
        The decoded JSON reply (a dict whose ``'status'`` is ``'ok'``), or
        ``{"status": "fail", "reason": ...}`` when a polling request raises
        ``urllib2.HTTPError`` or the server still has no answer after more
        than 300 seconds of polling.

    Raises:
        urllib2.HTTPError: if the *initial* request fails (only polling
            requests convert the error into a "fail" dict).
        SystemExit: with a non-zero status if the decoded reply has no
            ``'status'`` field or a status other than ``'ok'``.
    """
    host = "https://cmsweb.cern.ch"
    path = '/das/cache'
    params = {'input':query, 'idx':0, 'limit':0}
    client = '%s (%s)' % (DAS_CLIENT, os.environ.get('USER', ''))
    headers = {"Accept": "application/json", "User-Agent": client}
    # One opener is shared by all requests so that cookies set by the first
    # reply are sent back on the polling requests. The proxy handler is built
    # from an empty mapping.
    opener = urllib2.build_opener(
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.ProxyHandler({}),
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    pid_pattern = re.compile(r'^[a-z0-9]{32}')

    def _fetch():
        """Issue one request with the current ``params`` and return the raw body."""
        url = host + path + '?%s' % urllib.urlencode(params, doseq=True)
        fdesc = opener.open(urllib2.Request(url=url, headers=headers))
        try:
            return fdesc.read()
        finally:
            # Close the response even if read() raises.
            fdesc.close()

    def _is_pid(reply):
        """Return True when ``reply`` is a bare 32-character request id."""
        return bool(reply and isinstance(reply, str)
                    and pid_pattern.match(reply) and len(reply) == 32)

    data = _fetch()

    iwtime = 2 # initial waiting time in seconds
    wtime = 20 # final waiting time in seconds
    sleep = iwtime
    time0 = time.time()
    while _is_pid(data):
        params.update({'pid':data})
        try:
            data = _fetch()
        except urllib2.HTTPError as err:
            return {"status":"fail", "reason":str(err)}
        if not _is_pid(data):
            # The real answer has arrived: do not sleep and do not let the
            # timeout check below discard a reply we already hold.
            break
        time.sleep(sleep)
        # Waiting time doubles from iwtime, is clamped to wtime once it
        # overshoots, and then restarts from iwtime.
        if sleep < wtime:
            sleep *= 2
        elif sleep == wtime:
            sleep = iwtime # start new cycle
        else:
            sleep = wtime
        if (time.time()-time0) > 300:
            return {"status":"fail", "reason":("client timeout after %s sec" % int(time.time()-time0))}

    jsondict = json.loads(data)
    if ('status' not in jsondict) or jsondict['status'] != 'ok':
        print('DAS record with bad status or without status field:\n')
        pprint.pprint(jsondict)
        # Exit with a non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    return jsondict
63 
def findKeyValue(data, key):
    """Return the value stored under ``key`` in a dict or a list of dicts.

    Args:
        data: Either a dict, or a list whose dict entries are searched for
            ``key``. List entries that are not dicts are ignored.
        key: The key to look up.

    Returns:
        The value found. For a list, several entries may carry the key as
        long as they all hold an equal value. The string ``"NULL"`` is
        returned (after printing a warning) when the key is absent or when
        ``data`` is neither a list nor a dict.

    Raises:
        SystemExit: with a non-zero status when two list entries hold
            different values for ``key``.
    """
    if isinstance(data, list):
        # Entries that are not dicts cannot hold the key; skip them rather
        # than failing with AttributeError.
        matches = [item[key] for item in data
                   if isinstance(item, dict) and key in item]
        if not matches:
            print("WARNING: Returning NULL. Could not find key '%s' in list." % key)
            return "NULL"
        value = matches[0]
        if any(value != other for other in matches[1:]):
            pprint.pprint(data)
            print("ERROR: Found multiple instances of key '%s'." % key)
            # Non-zero status: this is an error exit.
            sys.exit(1)
        return value

    if isinstance(data, dict):
        if key in data:
            return data[key]
        print("WARNING: Returning NULL. Could not find key '%s' in dict." % key)
        return "NULL"

    print("WARNING: Returning NULL. Dictionary is of neither type list or dict:")
    return "NULL"
95 
def getFileRunInfo(file, getlumis = False, verbose = False):
    """Query DAS for the runs (and optionally lumi sections) contained in a file.

    Args:
        file: Logical file name, appended to the DAS query ``'lumi file='``.
            (The name shadows a builtin but is kept so keyword callers
            continue to work.)
        getlumis: When true, return the full run -> lumi-list mapping;
            otherwise return only the run numbers.
        verbose: When true, print each run as it is found and the final
            mapping.

    Returns:
        A dict mapping run number to a list of lumi section numbers when
        ``getlumis`` is true, else a list of run numbers. A run number that
        cannot be found is recorded as ``-1``; missing lumi information is
        recorded as a single ``-1`` entry.
    """
    jsondict = get_data('lumi file='+file)

    # when looking at data (not mc), we expect a list of dictionaries
    # because a number of runs got combined into one if it was not prompt reco
    rundict = {}
    for idata in jsondict['data']:
        orig_lumidict = findKeyValue(idata, 'lumi')
        run_number = findKeyValue(orig_lumidict, 'run_number')
        if run_number == "NULL":
            run_number = -1 # expecting an int
        if verbose:
            print("Found run %s in file %s." % (run_number, file))
        # Accumulate rather than reset: if the same run shows up in more than
        # one record, lumis gathered from earlier records must not be lost.
        run_lumis = rundict.setdefault(run_number, [])
        lumis = findKeyValue(orig_lumidict, 'number')
        if lumis == "NULL":
            run_lumis.append(-1)
        else:
            # Each entry is indexed as an inclusive [first, last] range.
            for ll in lumis:
                run_lumis.extend(range(ll[0], ll[1]+1))

    if verbose:
        print("Contents of %s" % file)
        pprint.pprint(rundict)
    if getlumis:
        return rundict
    return list(rundict)
123 
def getFilesInfo(dataset, wanted_keys = ['name','size','nevents'], verbose = False):
    """Query DAS for the files of a dataset and keep selected attributes.

    Args:
        dataset: Dataset name, appended to the DAS query ``'file dataset='``.
        wanted_keys: Attribute names to copy from each file record. The
            default list is only iterated, never modified.
        verbose: When true, print one line per file with the selected
            values separated by spaces.

    Returns:
        A list with one dict per file record, mapping each wanted key to its
        value. Keys that cannot be found hold ``"NULL"``, except ``'size'``
        and ``'nevents'``, which hold ``-1``.
    """
    jsondict = get_data('file dataset='+dataset)
    fdicts = []
    for entry in jsondict['data']:
        orig_fdict = entry['file']
        skim_fdict = {}
        for key in wanted_keys:
            value = findKeyValue(orig_fdict, key)
            if key in ('size', 'nevents') and value == "NULL":
                value = -1 # numeric fields get a numeric placeholder
            skim_fdict[key] = value
        if verbose:
            # Print the whole record at once so that warnings emitted by
            # findKeyValue do not end up in the middle of the line.
            print(' '.join('%s' % (skim_fdict[key],) for key in wanted_keys))
        fdicts.append(skim_fdict)

    return fdicts
141 
def getDatasetInfo(dataset, wanted_keys = ['name','size','nevents','nfiles'], verbose = False):
    """Query DAS for a single dataset and keep selected attributes.

    Args:
        dataset: Dataset name, appended to the DAS query ``'dataset='``.
        wanted_keys: Attribute names to copy from the dataset record. The
            default list is only iterated, never modified.
        verbose: When true, print the selected values on one line,
            separated by spaces.

    Returns:
        A dict mapping each wanted key to the value found by findKeyValue
        (``"NULL"`` when the key is missing).
    """
    jsondict_ds = get_data('dataset='+dataset)
    # The 'dataset' value is looked up across all returned records.
    orig_dsdict = findKeyValue(jsondict_ds['data'], 'dataset')
    skim_dsdict = {}
    for key in wanted_keys:
        skim_dsdict[key] = findKeyValue(orig_dsdict, key)
    if verbose:
        # Print the whole record at once so that warnings emitted by
        # findKeyValue do not end up in the middle of the line.
        print(' '.join('%s' % (skim_dsdict[key],) for key in wanted_keys))

    return skim_dsdict
154 
# Use a wildcard to retrieve info for multiple datasets
def getDatasetsInfo(dataset, wanted_keys = ['name','size','nevents','nfiles'], verbose = False):
    """Return a list of getDatasetInfo results for one or more datasets.

    Without a '*' in ``dataset`` the list holds the single result for that
    dataset. With a '*', DAS is first asked for the matching datasets and
    each name found is then looked up individually.
    """
    if '*' not in dataset:
        return [getDatasetInfo(dataset, wanted_keys = wanted_keys, verbose = verbose)]

    # A wildcard query returns only minimal info per dataset, so it is used
    # just to collect the names; the details come from one query per name.
    matches = get_data('dataset='+dataset)
    collected = []
    for record in matches['data']:
        record_ds = findKeyValue(record, 'dataset')
        name = findKeyValue(record_ds, 'name')
        collected.append(getDatasetInfo(name, wanted_keys = wanted_keys, verbose = verbose))
    return collected
172 
173 # test
174 # answer = getDatasetsInfo("/ggZH_HToBB_ZToNuNu_M125_13TeV_powheg_*/RunIISpring15DR74*/MINIAODSIM", verbose = True)
175 # answer = getDatasetsInfo("/ttHJetTobb_M125_13TeV_amcatnloFXFX_madspin_pythia8/RunIISpring15DR74-Asympt25ns_MCRUN2_74_V9_ext3-v1/MINIAODSIM", verbose = True)
176 # answer = getFilesInfo("/HTMHT/Run2015D-PromptReco-v3/MINIAOD", verbose = True)
177 # answer = getFilesInfo("/HTMHT/Run2015D-PromptReco-v3/MINIAOD", verbose = True)
178 # answer = getFileRunInfo("/store/data/Run2015D/HTMHT/MINIAOD/05Oct2015-v1/10000/0A668427-B06F-E511-813E-0025904B12A8.root", getlumis = True, verbose = True)
179 
180 # answer = getDatasetsInfo("/HTMHT/Run2015D*/MINIAOD", verbose = True)
181 # answer = getDatasetsInfo("/MET/Run2015D*/MINIAOD", verbose = True)
182 # answer = getDatasetsInfo("/SingleElectron/Run2015D*/MINIAOD", verbose = True)
183 # answer = getDatasetsInfo("/SingleMuon/Run2015D*/MINIAOD", verbose = True)
184 # answer = getDatasetsInfo("/JetHT/Run2015D*/MINIAOD", verbose = True)
185 # answer = getDatasetsInfo("/DoubleEG/Run2015D*/MINIAOD", verbose = True)
186 # answer = getDatasetsInfo("/DoubleMuon/Run2015D*/MINIAOD", verbose = True)
def getFilesInfo(dataset, wanted_keys=['name', 'size', 'nevents'], verbose=False)
Definition: das_client.py:124
def get_data(query)
Definition: das_client.py:11
def getDatasetsInfo(dataset, wanted_keys=['name', 'size', 'nevents', 'nfiles'], verbose=False)
Definition: das_client.py:156
def getDatasetInfo(dataset, wanted_keys=['name', 'size', 'nevents', 'nfiles'], verbose=False)
Definition: das_client.py:142
def findKeyValue(data, key)
Definition: das_client.py:64
def getFileRunInfo(file, getlumis=False, verbose=False)
Definition: das_client.py:96