ra4_stats  0341147a0dc35f80f4e12c6003afb76a38e2ed6e
bin_significance.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 
3 from __future__ import print_function
4 
5 import argparse
6 import os
7 import math
8 
9 import scipy.stats
10 
11 import ROOT
12 
13 import utils
14 
16  funcs = w.allFunctions()
17  iterator = funcs.createIterator()
18  func = iterator.Next()
19  names = ["Systematics"]
20  while func:
21  name = func.GetName()
22  if name.startswith("nbkg_BLK_"):
23  names.append(name[9:])
24  func = iterator.Next()
25  iterator.Reset()
26  return names
27 
29  funcs = w.allFunctions()
30  iterator = funcs.createIterator()
31  func = iterator.Next()
32  names = []
33  while func:
34  name = func.GetName()
35  if name.startswith("nmc_BLK_") and "_PRC_" in name:
36  pos = name.find("_PRC_") + 5
37  names.append(name[pos:])
38  func = iterator.Next()
39  iterator.Reset()
40  return set(names)
41 
43  pdfs = w.allPdfs()
44  iterator = pdfs.createIterator()
45  pdf = iterator.Next()
46  names = []
47  while pdf:
48  name = pdf.GetName()
49  if name.startswith("constraint_"):
50  names.append(name[11:])
51  pdf = iterator.Next()
52  iterator.Reset()
53  return names
54 
55 def find_sat_nlls(bins, procs, w):
56  for b in bins:
57  name = b[0]
58  b[3] = 0.
59  if name == "Systematics": continue
60  nobs = w.var("nobs_BLK_"+name).getVal()
61  if nobs > 0:
62  b[3] -= nobs*math.log(nobs) - nobs - math.lgamma(nobs+1.)
63  for p in procs:
64  nobs = w.var("nobsmc_BLK_{}_PRC_{}".format(name, p)).getVal()
65  if nobs > 0:
66  b[3] -= nobs*math.log(nobs) - nobs - math.lgamma(nobs+1.)
67 
68 def set_variables(w, f):
69  pars = f.floatParsFinal()
70  set_r = False
71  for i in range(pars.getSize()):
72  par = pars[i]
73  var = w.var(par.GetName())
74  var.removeRange()
75  var.setVal(par.getVal())
76  var.setError(par.getError())
77  if par.GetName() == "r":
78  set_r = True
79  r = w.var("r")
80  if not set_r:
81  r.setVal(0.)
82  r.setConstant(True)
83  else:
84  r.setConstant(False)
85 
86 def find_bkg_nlls(bins, procs, constraints, w, fit_b):
87  set_variables(w, fit_b)
88  for b in bins:
89  name = b[0]
90  b[1] = 0.
91  if name == "Systematics":
92  for c in constraints:
93  b[1] -= w.pdf("constraint_{}".format(c)).getLogVal(None)
94  continue
95 
96  b[1] -= w.pdf("pdf_null_BLK_{}".format(name)).getLogVal()
97  for p in procs:
98  b[1] -= w.pdf("pdf_mc_BLK_{}_PRC_{}".format(name, p)).getLogVal()
99 
100 def find_sig_nlls(bins, procs, constraints, w, fit_s):
101  set_variables(w, fit_s)
102  for b in bins:
103  name = b[0]
104  b[2] = 0.
105  if name == "Systematics":
106  for c in constraints:
107  b[2] -= w.pdf("constraint_{}".format(c)).getLogVal(None)
108  continue
109 
110  b[2] -= w.pdf("pdf_alt_BLK_{}".format(name)).getLogVal()
111  for p in procs:
112  b[2] -= w.pdf("pdf_mc_BLK_{}_PRC_{}".format(name, p)).getLogVal()
113 
114 
115 def pretty_name(name):
116  if "_BIN_" in name:
117  pos = name.find("_BIN_")+5
118  name = name[pos:]
119 
120  name = name.replace("r1_","R1: ")
121  name = name.replace("r2_","R2: ")
122  name = name.replace("r3_","R3: ")
123  name = name.replace("r4_","R4: ")
124 
125  name = name.replace("lowmet", "200<MET#leq 350")
126  name = name.replace("medmet", "350<MET#leq 500")
127  name = name.replace("highmet", "MET>500")
128 
129  name = name.replace("_lownj", ", 6#leq N_{jets}#leq 8")
130  name = name.replace("_highnj", ", N_{jets}#geq 9")
131 
132  name = name.replace("_allnb", "")
133  name = name.replace("_1b", ", N_{b}=1")
134  name = name.replace("_2b", ", N_{b}=2")
135  name = name.replace("_3b", ", N_{b}#geq 3")
136 
137  return name
138 
139 def get_threshold(n_sigma, n_bins, n_dof):
140  norm_cdf = scipy.stats.norm.cdf(n_sigma)
141  chi2_ppf = scipy.stats.chi2.ppf(norm_cdf, n_dof)
142  return chi2_ppf/n_bins
143 
144 def style(h, width, style, color):
145  h.SetLineWidth(width)
146  h.SetLineStyle(style)
147  h.SetLineColor(color)
148 
149 def make_plot(out_dir, input_name, bins, ndof, r_sign, bkg_idx, sig_idx):
150  if sig_idx == 2:
151  ndof = 1
152  elif sig_idx == 3:
153  r_sign = 0
154 
155  chisq = 0.
156  for i in range(len(bins)):
157  chisq += 2.*(bins[i][bkg_idx]-bins[i][sig_idx])
158  p = scipy.stats.chi2.sf(chisq,ndof)
159  if r_sign < 0:
160  p = 1.-0.5*p
161  elif r_sign > 0:
162  p = 0.5*p
163  z = scipy.stats.norm.isf(p)
164 
165  out_name = "{}_chisq_{}_model.pdf".format(input_name,
166  "signal" if sig_idx == 2 else "saturated")
167  out_path = os.path.join(out_dir, out_name)
168 
169  h_name = "Background-Only vs {} Model (#chi^{{2}}={:.2f}, N.D.o.F.={:d}, p={:.3f}, Z={:.2f});;#chi^{{2}}"
170  h_name = h_name.format("Signal" if sig_idx == 2 else "Saturated", chisq, ndof, p, z)
171 
172  h = ROOT.TH1D("", h_name, len(bins), -0.5, len(bins)-0.5)
173  h.SetStats(False)
174  h0 = ROOT.TH1D("", h_name, len(bins), -0.5, len(bins)-0.5)
175 
176  style(h, 4, 1, ROOT.kBlack)
177  style(h0, 1, 2, ROOT.kBlack)
178 
179  y0 = get_threshold(0., len(bins), ndof)
180 
181  for i in range(len(bins)):
182  h.GetXaxis().SetBinLabel(i+1, pretty_name(bins[i][0]))
183  chisq = 2.*(bins[i][bkg_idx]-bins[i][sig_idx])
184  h.SetBinContent(i+1,chisq)
185  h0.SetBinContent(i+1, y0)
186 
187  h.LabelsOption("v","x")
188 
189  c = ROOT.TCanvas()
190  c.cd()
191  c.SetMargin(0.1, 0.05, 0.4, 0.1)
192  h.Draw("hist")
193  h0.Draw("hist same")
194  c.Print(out_path)
195 
196 def bin_significance(out_dir, input_path, ndof):
197  input_path = utils.full_path(input_path)
198  out_dir = utils.full_path(out_dir)
199 
200  input_name = os.path.splitext(os.path.basename(input_path))[0]
201 
202  with utils.ROOTFile(input_path, "read") as f:
203  w = f.Get("w")
204  fit_b = f.Get("fit_b")
205  fit_s = f.Get("fit_s")
206  bins = [ [b, 0., 0., 0.] for b in get_bin_names(w) ]
207  procs = get_process_names(w)
208  constraints = get_constraint_names(w)
209  r_sign = 0
210  r = fit_s.floatParsFinal().find("r")
211  if r.getVal() < 0.:
212  r_sign = -1
213  elif r.getVal() > 0.:
214  r_sign = 1
215 
216  find_sat_nlls(bins, procs, w)
217  find_bkg_nlls(bins, procs, constraints, w, fit_b)
218  find_sig_nlls(bins, procs, constraints, w, fit_s)
219 
220  print("{:>48s}: {:>12s} {:>12s} {:>12s}".format("Bin Name", "Bkg NLL", "Sig NLL", "Sat NLL"))
221  for b in bins:
222  print("{:>48s}: {:>12.3f} {:>12.3f} {:>12.3f}".format(b[0],b[1],b[2],b[3]))
223 
224  make_plot(out_dir, input_name, bins, ndof, r_sign, 1, 2)
225  make_plot(out_dir, input_name, bins, ndof, 0, 1, 3)
226 
227 if __name__ == "__main__":
228  parser = argparse.ArgumentParser(description="Finds approximate significance contribution from each bin",
229  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
230  parser.add_argument("in_file", help="File to analyze, containing global workspace and cached fit results")
231  parser.add_argument("out_dir", default=".", nargs="?", help="Directory in which to place results")
232  parser.add_argument("--ndof", type=int, default=18, help="Degrees of freedom added by saturated model compared to background-only model")
233 
234  args = parser.parse_args()
235 
236  bin_significance(args.out_dir, args.in_file, args.ndof)
def find_sat_nlls(bins, procs, w)
def full_path(path)
Definition: utils.py:7
def find_bkg_nlls(bins, procs, constraints, w, fit_b)
def get_threshold(n_sigma, n_bins, n_dof)
def make_plot(out_dir, input_name, bins, ndof, r_sign, bkg_idx, sig_idx)
def style(h, width, style, color)
def bin_significance(out_dir, input_path, ndof)
def set_variables(w, f)
def find_sig_nlls(bins, procs, constraints, w, fit_s)