ra4_macros  bede988c286599a3a84b77a4d788ac0a971e89f9
plot_mj_bdt_aux.cxx
Go to the documentation of this file.
1 // plot_mj_bdt_aux: trains various BDTs which combine MJ, njets, and HT. Plots ROCs with them
2 // If run without parameters, it plots the ROCs (it requires the root files with the BDTs)
3 // If run with a parameter, it does the BDT training
4 // If run with two parameters, it doesn the BDT training and adds the 2nd parameter to the folder name
5 
6 #include <stdexcept>
7 #include <iostream>
8 #include <vector>
9 #include <utility>
10 
11 #include "TFile.h"
12 #include "TCanvas.h"
13 #include "TMarker.h"
14 #include "TString.h"
15 #include "TChain.h"
16 #include "TLegend.h"
17 #include "TCut.h"
18 #include "TSystem.h"
19 #include "TDirectory.h"
20 #include "TMVA/Tools.h"
21 #include "TMVA/Factory.h"
22 
23 #include "plot_mj_bdt_aux.hpp"
24 #include "utilities.hpp"
25 #include "utilities_macros.hpp"
26 #include "styles.hpp"
27 
28 using namespace std;
29 
30 // If run without parameters, it plots the ROCs (it requires the root files with the BDTs)
31 // If run with a parameter, it does the BDT training
32 // If run with two parameters, it doesn the BDT training and adds the 2nd parameter to the folder name
33 int main(int argc, char *argv[]){
34  TString tag("");
35  if(argc > 2) tag = argv[2];
36 
37  TString bfolder("");
38  string hostname = execute("echo $HOSTNAME");
39  if(Contains(hostname, "cms") || Contains(hostname, "compute-"))
40  bfolder = "/net/cms2"; // In laptops, you can't create a /net folder
41 
42  TString ntufolder=bfolder+"/cms2r0/babymaker/babies/2015_01_30/small_tree/skim_ht500met200/";
43  TString bdtfolder=bfolder+"/cms2r0/babymaker/babies/2015_01_30/small_tree/bdt"+tag+"/";
44 
45  if(argc > 1) trainBDT(ntufolder, bdtfolder);
46  else plotROC(bdtfolder);
47 }
48 
49 
50 void plotROC(TString bdtfolder){
51 
53  TString lsp = "{#lower[-0.1]{#tilde{#chi}}#lower[0.2]{#scale[0.85]{^{0}}}#kern[-1.3]{#scale[0.85]{_{1}}}}";
54  TString t1t_label = "#tilde{g}#kern[0.2]{#tilde{g}}, #tilde{g}#rightarrowt#kern[0.18]{#bar{t}}#kern[0.18]"+lsp;
55  vector<TString> v_t1t_tt_htmj_30;
56  v_t1t_tt_htmj_30.push_back(bdtfolder+"bdt_T1tttt1500_TTJet_htmj_30.root");
57  vector<sample_class> t1t_tt_htmj_30;
58  t1t_tt_htmj_30.push_back(sample_class(t1t_label+" (1500,100)", v_t1t_tt_htmj_30));
59  t1t_tt_htmj_30.push_back(sample_class("t#bar{t}", v_t1t_tt_htmj_30));
60 
61  vector<TString> v_t1t_tt_htnjets30;
62  v_t1t_tt_htnjets30.push_back(bdtfolder+"bdt_T1tttt1500_TTJet_htnjets30.root");
63  vector<sample_class> t1t_tt_htnjets30;
64  t1t_tt_htnjets30.push_back(sample_class(t1t_label+" (1500,100)", v_t1t_tt_htnjets30));
65  t1t_tt_htnjets30.push_back(sample_class("t#bar{t}", v_t1t_tt_htnjets30));
66 
67  vector<TString> v_t1t_tt_njets30mj_30;
68  v_t1t_tt_njets30mj_30.push_back(bdtfolder+"bdt_T1tttt1500_TTJet_njets30mj_30.root");
69  vector<sample_class> t1t_tt_njets30mj_30;
70  t1t_tt_njets30mj_30.push_back(sample_class(t1t_label+" (1500,100)", v_t1t_tt_njets30mj_30));
71  t1t_tt_njets30mj_30.push_back(sample_class("t#bar{t}", v_t1t_tt_njets30mj_30));
72 
73  // T1tttt(1200,800)
74  vector<TString> v_t1tc_tt_htmj_30;
75  v_t1tc_tt_htmj_30.push_back(bdtfolder+"bdt_T1tttt1200_TTJet_htmj_30.root");
76  vector<sample_class> t1tc_tt_htmj_30;
77  t1tc_tt_htmj_30.push_back(sample_class(t1t_label+" (1200,800)", v_t1tc_tt_htmj_30));
78  t1tc_tt_htmj_30.push_back(sample_class("t#bar{t}", v_t1tc_tt_htmj_30));
79 
80  vector<TString> v_t1tc_tt_htnjets30;
81  v_t1tc_tt_htnjets30.push_back(bdtfolder+"bdt_T1tttt1200_TTJet_htnjets30.root");
82  vector<sample_class> t1tc_tt_htnjets30;
83  t1tc_tt_htnjets30.push_back(sample_class(t1t_label+" (1200,800)", v_t1tc_tt_htnjets30));
84  t1tc_tt_htnjets30.push_back(sample_class("t#bar{t}", v_t1tc_tt_htnjets30));
85 
86  vector<TString> v_t1tc_tt_njets30mj_30;
87  v_t1tc_tt_njets30mj_30.push_back(bdtfolder+"bdt_T1tttt1200_TTJet_njets30mj_30.root");
88  vector<sample_class> t1tc_tt_njets30mj_30;
89  t1tc_tt_njets30mj_30.push_back(sample_class(t1t_label+" (1200,800)", v_t1tc_tt_njets30mj_30));
90  t1tc_tt_njets30mj_30.push_back(sample_class("t#bar{t}", v_t1tc_tt_njets30mj_30));
91 
93  int ht_col(kRed+1), mj_col(kAzure+2), col2(kGreen+1), col3(1), col4(kOrange+7);
94  int mj_style(8); float mj_size(2.);
95  vector<marker_class> mj_points, ht_points, nj_points;
96  mj_points.push_back(marker_class(200, mj_size, mj_col, mj_style));
97  mj_points.push_back(marker_class(400, 4, mj_col, 29));
98  mj_points.push_back(marker_class(600, mj_size, mj_col, mj_style));
99  mj_points.push_back(marker_class(800, mj_size, mj_col, mj_style));
100 
101  ht_points.push_back(marker_class(1000, 4, ht_col, 29));
102  ht_points.push_back(marker_class(1500, mj_size, ht_col, mj_style));
103  ht_points.push_back(marker_class(2000, mj_size, ht_col, mj_style));
104 
105  nj_points.push_back(marker_class(6, mj_size, col4, mj_style));
106  nj_points.push_back(marker_class(9, 4, col4, 29));
107  nj_points.push_back(marker_class(11, mj_size, col4, mj_style));
108 
109 
111  TString mj_s("M#lower[-.1]{_{J}}"), nj_s("N#lower[-.1]{_{jets}}"), ht_s("H#lower[-.1]{_{T}}");
112  vector<var_class> mj_t1t_tt;
113  mj_t1t_tt.push_back(var_class(t1t_tt_njets30mj_30, "BDT",0.32, -0.5,"BDT["+mj_s+", "+nj_s+"]",col3,7));
114  mj_t1t_tt.push_back(var_class(t1t_tt_htnjets30, "BDT",0.3, -0.5,"BDT["+ht_s+", "+nj_s+"]",col2,2));
115  mj_t1t_tt.push_back(var_class(t1t_tt_htmj_30, "BDT",0.23, -0.5,"BDT["+ht_s+", "+mj_s+"]",kMagenta+1,7));
116  mj_t1t_tt.push_back(var_class(t1t_tt_htmj_30, "ht",4000,0,""+ht_s+"",ht_col,1,ht_points));
117  mj_t1t_tt.push_back(var_class(t1t_tt_htmj_30, "njets30",15,0,""+nj_s+"",col4,1,nj_points));
118  mj_t1t_tt.push_back(var_class(t1t_tt_htmj_30, "mj_30",2000,0,""+mj_s+"",mj_col,1,mj_points));
119 
120  vector<var_class> mj_t1tc_tt;
121  mj_t1tc_tt.push_back(var_class(t1tc_tt_njets30mj_30, "BDT",0.24, -0.5,"BDT["+mj_s+", "+nj_s+"]",col3,7));
122  mj_t1tc_tt.push_back(var_class(t1tc_tt_htnjets30, "BDT",0.24, -0.5,"BDT["+ht_s+", "+nj_s+"]",col2,2));
123  mj_t1tc_tt.push_back(var_class(t1tc_tt_htmj_30, "BDT",0.23, -0.5,"BDT["+ht_s+", "+mj_s+"]",kMagenta+1,7));
124  mj_t1tc_tt.push_back(var_class(t1tc_tt_htmj_30, "ht",4000,0,""+ht_s+"",ht_col,1,ht_points));
125  mj_t1tc_tt.push_back(var_class(t1tc_tt_htmj_30, "njets30",15,0,""+nj_s+"",col4,1,nj_points));
126  mj_t1tc_tt.push_back(var_class(t1tc_tt_htmj_30, "mj_30",2000,0,""+mj_s+"",mj_col,1,mj_points));
127 
128  vector<TString> vs_sam;
129  vector<vector<var_class> > v_mj;
130  v_mj.push_back(mj_t1t_tt); vs_sam.push_back("t1t_tt");
131  v_mj.push_back(mj_t1tc_tt); vs_sam.push_back("t1tc_tt");
132 
133  vector<TString> v_cuts;
134  v_cuts.push_back("ht>500&&met>200");
135 
136  for(unsigned icut(0); icut<v_cuts.size(); icut++){
137  for(unsigned isam(0); isam<vs_sam.size(); isam++){
138  DrawROC(v_mj[isam], v_cuts[icut], "mj_bdt_"+vs_sam[isam]);
139  } // Loop over samples
140  } // Loop over cuts
141 
142 }
143 
144 
145 void trainBDT(TString ntufolder, TString bdtfolder){
146 
147  TString nTrain("20000"), nTest("0");
148  gSystem->mkdir(bdtfolder, kTRUE);
149 
150  vector<bdtvar_class> v_htmj;
151  v_htmj.push_back(bdtvar_class("ht","H_{T}",'F'));
152  v_htmj.push_back(bdtvar_class("mj_30","M_{J}",'F'));
153 
154  vector<bdtvar_class> v_htnjets;
155  v_htnjets.push_back(bdtvar_class("ht","H_{T}",'F'));
156  v_htnjets.push_back(bdtvar_class("njets30","N_{jets}",'I'));
157 
158  vector<bdtvar_class> v_njetsmj;
159  v_njetsmj.push_back(bdtvar_class("njets30","N_{jets}",'I'));
160  v_njetsmj.push_back(bdtvar_class("mj_30","M_{J}",'F'));
161 
162  vector<bdtvar_class> v_njetsmjhtmet;
163  v_njetsmjhtmet.push_back(bdtvar_class("mj_30","M_{J}",'F'));
164  v_njetsmjhtmet.push_back(bdtvar_class("ht","H_{T}",'F'));
165  v_njetsmjhtmet.push_back(bdtvar_class("njets30","N_{jets}",'I'));
166  v_njetsmjhtmet.push_back(bdtvar_class("met","MET",'F'));
167 
168  vector<TString> v_signal;
169  v_signal.push_back("*T1tttt*1500*");
170  v_signal.push_back("*T1tttt*1200*");
171 
172  vector<TString> v_bkg;
173  v_bkg.push_back("*TTJet*");
174 
175  vector<bdt_class> bdts;
176  for(unsigned isig(0); isig < v_signal.size(); isig++){
177  for(unsigned ibkg(0); ibkg < v_bkg.size(); ibkg++){
178  bdts.push_back(bdt_class(v_htmj, v_signal[isig], v_bkg[ibkg]));
179  bdts.push_back(bdt_class(v_htnjets, v_signal[isig], v_bkg[ibkg]));
180  bdts.push_back(bdt_class(v_njetsmj, v_signal[isig], v_bkg[ibkg]));
181  //bdts.push_back(bdt_class(v_njetsmjhtmet, v_signal[isig], v_bkg[ibkg]));
182  }
183  }
184 
185  for(unsigned ibdt(0); ibdt < bdts.size(); ibdt++){
186  // Loading root file and trees
187  TFile tmvaFile(bdtfolder+bdts[ibdt].name+".root", "RECREATE");
188  tmvaFile.cd();
189  TChain signal("tree"), ttbar("tree");
190  signal.Add(ntufolder+bdts[ibdt].signal);
191  ttbar.Add(ntufolder+bdts[ibdt].bkg);
192 
193  // Creating TMVA factory
194  TMVA::Factory *factory = new TMVA::Factory(bdts[ibdt].name, &tmvaFile,"!V:Silent:Color");
195  for(unsigned ind(0); ind < bdts[ibdt].vars.size(); ind++)
196  factory->AddVariable(bdts[ibdt].vars[ind].variable, bdts[ibdt].vars[ind].name,
197  bdts[ibdt].vars[ind].unit, bdts[ibdt].vars[ind].type);
198  factory->AddSpectator("met := met", "MET", "GeV", 'F');
199  factory->AddSpectator("ht := ht", "H_{T}", "GeV", 'F');
200  factory->AddSpectator("mj_30 := mj_30", "M_{J}", "GeV", 'F');
201  factory->AddSpectator("nvmus10 := nvmus10", "Number of veto muons", "", 'I');
202  factory->AddSpectator("nvels10 := nvels10", "Number of veto electrons", "", 'I');
203  factory->AddSpectator("nmus := nmus", "Number of muons", "", 'I');
204  factory->AddSpectator("nels := nels", "Number of electrons", "", 'I');
205  factory->AddSpectator("njets30 := njets30", "Number of 30 GeV jets", "", 'I');
206  factory->AddSignalTree (&signal, 1.);
207  factory->AddBackgroundTree(&ttbar, 1.);
208  factory->SetSignalWeightExpression ("weight");
209  factory->SetBackgroundWeightExpression("weight");
210 
211  TCut mycuts(""), mycutb("");
212  factory->PrepareTrainingAndTestTree( mycuts, mycutb, "nTrain_Signal="+nTrain+":nTrain_Background="+nTrain+
213  ":nTest_Signal="+nTest+":nTest_Background="+nTest+
214  ":SplitMode=Random:NormMode=NumEvents:!V" );
215  factory->BookMethod( TMVA::Types::kBDT, "BDT",
216  "!H:!V:NTrees=850:MinNodeSize=2.5%:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" );
217 
218  factory->TrainAllMethods(); // Train MVAs using the set of training events
219  factory->TestAllMethods(); // Evaluate all MVAs using the set of test events
220  factory->EvaluateAllMethods(); // Evaluate and compare performance of all configured MVAs
221 
222  delete factory;
223  tmvaFile.Write();
224  tmvaFile.Close();
225  cout<<"\t\t\t\t\t\t\t\t\t\t\tWritten "<<bdtfolder+bdts[ibdt].name+".root"<<endl;
226 
227  } // Loop over BDTs
228 
229 }
230 
231 
232 void DrawROC(vector<var_class> vars, TString cuts, TString tag){
233  styles style("RA4"); style.setDefaultStyle();
234  TCanvas can;
235  const int nbins(1000);
236  vector<vector<TH1D> > histos;
237  TString hname, totcut;
238  TChain *chain[2];
239 
240  for(unsigned var(0); var<vars.size(); var++){
241  vector<sample_class> samples = vars[var].samples;
242  for(unsigned sam(0); sam < samples.size(); sam++){
243  // Loading chains
244  for(unsigned isam(0); isam < samples[sam].files.size(); isam++){
245  chain[sam] = new TChain("TestTree");
246  int nfiles = chain[sam]->Add(samples[sam].files[isam]);
247  if(nfiles==0) cout<<samples[sam].files[isam]<<" not found"<<endl;
248  }
249  histos.push_back(vector<TH1D>());
250 
251  // Projecting variables
252  float minh(vars[var].minx), maxh(vars[var].maxx);
253  if(minh > maxh){
254  minh = maxh;
255  maxh = vars[var].minx;
256  }
257  hname = "histo"; hname += sam; hname += var;
258  totcut = "weight*("+cuts+"&&"+samples[sam].cut+"&&classID==";
259  totcut += sam; totcut += ")";
260  histos[sam].push_back(TH1D(hname,"",nbins,minh,maxh));
261  chain[sam]->Project(hname, vars[var].varname, totcut);
262  chain[sam]->Delete();
263  } // Loop over variables
264  } // Loop over samples
265 
266  TH1D base_histo("base","",1,0.01,1.0);
267  base_histo.SetMinimum(0.0);
268  base_histo.SetMaximum(1.0);
269  base_histo.SetDirectory(0);
270  base_histo.Draw();
271  style.setTitles(&base_histo, vars[0].samples[0].label+" efficiency", vars[0].samples[1].label+" efficiency",
272  "#scale[0.8]{#font[62]{CMS}} #scale[0.6]{#font[52]{Supplementary (Simulation)}}", "#scale[0.8]{13 TeV}");
273 
274  // Legend
275  double legX = style.PadLeftMargin+0.03, legY = 1-style.PadTopMargin-0.02, legSingle = 0.06;
276  double legW = 0.2, legH = legSingle*vars.size();
277  TLegend leg(legX, legY-legH, legX+legW, legY);
278  leg.SetTextSize(0.054); leg.SetFillColor(0); leg.SetFillStyle(0); leg.SetBorderSize(0);
279  leg.SetTextFont(132);
280 
281  // Making individual graphs
282  TGraph graphs[100]; // Had to make it an array because a vector<TGraph> kept crashing
283  for(unsigned var(0); var<vars.size(); var++){
284  graphs[var] = MakeROC(histos[0][var], histos[1][var], vars[var].minx < vars[var].maxx, vars[var].cuts);
285  graphs[var].SetLineColor(vars[var].color);
286  graphs[var].SetLineStyle(vars[var].style);
287  if(vars[var].style==1) graphs[var].SetLineWidth(5);
288  else graphs[var].SetLineWidth(8);
289  leg.AddEntry(&(graphs[var]), vars[var].title, "l");
290  graphs[var].Draw("lsame");
291  } // Loop over variables
292  leg.Draw();
293 
294  cuts.ReplaceAll(".","");
295  cuts.ReplaceAll("(",""); cuts.ReplaceAll("$",""); cuts.ReplaceAll(")","");
296  cuts.ReplaceAll("[",""); cuts.ReplaceAll("]","");
297  cuts.ReplaceAll("/","_"); cuts.ReplaceAll("*",""); cuts.ReplaceAll("&&","_");
298  cuts.ReplaceAll(">=","ge"); cuts.ReplaceAll("<=","se");
299  cuts.ReplaceAll(">","g"); cuts.ReplaceAll("<","s"); cuts.ReplaceAll("=","");
300  cuts.ReplaceAll("+","");
301  TString pname("plots/roc_"+tag+"_"+cuts+".pdf");
302  //can.Print(pname);
303  can.SetLogx(1);
304  can.SetLogy(1);
305  pname.ReplaceAll("roc_","log_roc_");
306  base_histo.SetMinimum(1e-4);
307  can.Print(pname);
308 
309 }
310 
311 TGraph MakeROC(TH1D &good, TH1D &bad, const bool less_is_better, vector<marker_class> cuts){
312  const int nbins = good.GetNbinsX();
313  if(bad.GetNbinsX() != nbins) throw logic_error("Mismatched number of bins");
314 
315  TMarker marker;
316 
317  TGraph graph(0);
318  const double good_tot = good.Integral(0, nbins+1);
319  const double bad_tot = bad.Integral(0, nbins+1);
320  int inibin(0), endbin(nbins+1), dbin(1); unsigned icut(0);
321  if(less_is_better){
322  inibin = nbins+1;
323  endbin = 0;
324  dbin = -1;
325  }
326  for(int bin = inibin; bin*dbin<=endbin*dbin; bin+=dbin){
327  const double good_pass = good.Integral(min(endbin,bin), max(endbin,bin));
328  const double bad_pass = bad.Integral(min(endbin,bin), max(endbin,bin));
329  const double x = good_pass/good_tot;
330  const double y = bad_pass/bad_tot;
331  graph.SetPoint(graph.GetN(), x, y);
332 
333  // Plotting the stars
334  if(icut<cuts.size()){
335  float edge(good.GetXaxis()->GetBinUpEdge(bin));
336  if((edge>=cuts[icut].cut&&!less_is_better) || (edge<=cuts[icut].cut&&less_is_better)){
337  marker.SetMarkerStyle(cuts[icut].style);marker.SetMarkerColor(cuts[icut].color);
338  marker.SetMarkerSize(cuts[icut].size);
339  if(x>0.01 && y>0.0001) marker.DrawMarker(x,y);
340  icut++;
341  }
342  }
343  }
344  TString name(good.GetName());
345  name += "graph";
346  graph.SetName(name);
347  graph.SetTitle(name);
348 
349  return graph;
350 }
351 
352 var_class::var_class(vector<sample_class> isamples, TString ivarname, float iminx, float imaxx, TString ititle,
353  int icolor, int istyle, vector<marker_class> icuts){
354  varname = ivarname; minx = iminx; maxx = imaxx; title = ititle;
355  cuts = icuts;
356  color = icolor; style = istyle;
357  samples = isamples;
358 }
359 
360 sample_class::sample_class(TString ilabel, vector<TString> ifiles, TString icut){
361  files = ifiles; label = ilabel; cut = icut;
362 }
363 
364 marker_class::marker_class(float icut, float isize, int icolor, int istyle){
365  cut=icut; size=isize; color=icolor; style=istyle;
366 }
367 
368 
369 
370 bdt_class::bdt_class(vector<bdtvar_class> ivars, TString isignal, TString ibkg):
371  vars(ivars),
372  signal(isignal),
373  bkg(ibkg){
374  name = "bdt_"+signal+"_"+bkg+"_";
375  for(unsigned ind(0); ind < vars.size(); ind++) name += vars[ind].variable;
376  name.ReplaceAll("*","");
377 }
378 
379 bdtvar_class::bdtvar_class(TString ivariable, TString iname, char itype, TString iunit):
380  variable(ivariable),
381  name(iname),
382  type(itype),
383  unit(iunit){
384 }
385 
void trainBDT(TString ntufolder, TString bdtfolder)
bdt_class(std::vector< bdtvar_class > ivars, TString isignal, TString ibkg)
void setDefaultStyle()
Definition: styles.cpp:36
bool Contains(const std::string &text, const std::string &pattern)
TString signal
float PadTopMargin
Definition: styles.hpp:36
STL namespace.
std::string execute(const std::string &cmd)
int main(int argc, char *argv[])
void DrawROC(vector< var_class > vars, TString cuts, TString tag)
void setTitles(TH1 *h, TString xTitle="", TString yTitle="", TString Left="", TString Right="")
Definition: styles.cpp:173
bdtvar_class(TString variable, TString name, char type, TString unit="GeV")
float PadLeftMargin
Definition: styles.hpp:36
marker_class(float icut, float isize, int icolor, int istyle)
std::vector< bdtvar_class > vars
var_class(std::vector< sample_class > samples, TString ivarname, float iminx, float imaxx, TString ititle, int icolor, int istyle=1, std::vector< marker_class > icuts=std::vector< marker_class >())
TGraph MakeROC(TH1D &good, TH1D &bad, const bool less_is_better, vector< marker_class > cuts)
sample_class(TString ilabel, std::vector< TString > ifiles, TString icut="1")
void plotROC(TString bdtfolder)