diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..642be346 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "boostedhh"] + path = boostedhh + url = https://github.com/LPC-HH/boostedhh diff --git a/README.md b/README.md index fd0a3d04..d35ee33f 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,6 @@ four beauty quarks (b). - [Jobs](#jobs) - [Dask](#dask) - [Postprocessing](#postprocessing) - - [Setup](#setup-1) - - [BDT Training](#bdt-training) - - [Creating templates / FOM Scan / BDT ROC curve](#creating-templates--fom-scan--bdt-roc-curve) - [Condor Scripts](#condor-scripts) - [Check jobs](#check-jobs) - [Combine pickles](#combine-pickles) @@ -74,7 +71,7 @@ micromamba activate hh4b ```bash # Clone the repository -git clone https://github.com/LPC-HH/HH4b.git +git clone --recursive https://github.com/LPC-HH/HH4b.git cd HH4b # Perform an editable installation pip install -e . @@ -83,6 +80,10 @@ pip install pre-commit pre-commit install # install requirements pip3 install -r requirements.txt +# install common boostedhh library +cd boostedhh +pip install -e . +cd .. ``` ### Troubleshooting diff --git a/boostedhh b/boostedhh new file mode 160000 index 00000000..74808dcb --- /dev/null +++ b/boostedhh @@ -0,0 +1 @@ +Subproject commit 74808dcbf7134f940b804935412ac8b2f4a9b83d diff --git a/src/HH4b/processors/SkimmerABC.py b/src/HH4b/processors/SkimmerABC.py deleted file mode 100644 index b7de57a2..00000000 --- a/src/HH4b/processors/SkimmerABC.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -Skimmer Base Class - common functions for all skimmers. -Author(s): Raghav Kansal -""" - -from __future__ import annotations - -import logging -from abc import abstractmethod -from pathlib import Path - -import numpy as np -import pandas as pd -from coffea import processor - -from HH4b.hh_vars import LUMI - -from . 
import corrections - -logging.basicConfig(level=logging.INFO) - - -class SkimmerABC(processor.ProcessorABC): - """ - Skims nanoaod files, saving selected branches and events passing preselection cuts - (and triggers for data). - - Args: - xsecs (dict, optional): sample cross sections, - if sample not included no lumi and xsec will not be applied to weights - save_ak15 (bool, optional): save ak15 jets as well, for HVV candidate - """ - - XSECS = None - - def to_pandas(self, events: dict[str, np.array]): - """ - Convert our dictionary of numpy arrays into a pandas data frame - Uses multi-index columns for numpy arrays with >1 dimension - (e.g. FatJet arrays with two columns) - """ - return pd.concat( - # [pd.DataFrame(v.reshape(v.shape[0], -1)) for k, v in events.items()], - [pd.DataFrame(v) for k, v in events.items()], - axis=1, - keys=list(events.keys()), - ) - - def dump_table(self, pddf: pd.DataFrame, fname: str, odir_str: str = None) -> None: - """ - Saves pandas dataframe events to './outparquet' - """ - import pyarrow as pa - import pyarrow.parquet as pq - - local_dir = (Path() / "outparquet").resolve() - if odir_str: - local_dir += odir_str - local_dir.mkdir(parents=True, exist_ok=True) - - # need to write with pyarrow as pd.to_parquet doesn't support different types in - # multi-index column names - table = pa.Table.from_pandas(pddf) - if len(table) != 0: # skip dataframes with empty entries - pq.write_table(table, local_dir / fname) - - def pileup_cutoff(self, events, year, cutoff: float = 4): - pweights = corrections.get_pileup_weight(year, events.Pileup.nPU.to_numpy()) - pw_pass = ( - (pweights["nominal"] <= cutoff) - * (pweights["up"] <= cutoff) - * (pweights["down"] <= cutoff) - ) - logging.info(f"Passing pileup weight cut: {np.sum(pw_pass)} out of {len(events)} events") - events = events[pw_pass] - return events - - def get_dataset_norm(self, year, dataset): - """ - Cross section * luminosity normalization for a given dataset and year. 
- This still needs to be normalized with the acceptance of the pre-selection in post-processing. - (Done in postprocessing/utils.py:load_samples()) - """ - if dataset in self.XSECS: - xsec = self.XSECS[dataset] - weight_norm = xsec * LUMI[year] - else: - logging.warning("Weight not normalized to cross section") - weight_norm = 1 - - print("weight_norm", weight_norm) - - return weight_norm - - @abstractmethod - def add_weights(self) -> tuple[dict, dict]: - """Adds weights and variations, saves totals for all norm preserving weights and variations""" diff --git a/src/HH4b/processors/bbbbSkimmer.py b/src/HH4b/processors/bbbbSkimmer.py index e054e3d0..ebd899b9 100644 --- a/src/HH4b/processors/bbbbSkimmer.py +++ b/src/HH4b/processors/bbbbSkimmer.py @@ -16,13 +16,8 @@ import pandas as pd import vector import xgboost as xgb -from coffea import processor -from coffea.analysis_tools import PackedSelection, Weights - -import HH4b - -from . import objects, utils -from .corrections import ( +from boostedhh.processors import SkimmerABC, utils +from boostedhh.processors.corrections import ( JECs, add_pileup_weight, add_ps_weight, @@ -31,6 +26,19 @@ get_pdf_weights, get_scale_weights, ) +from boostedhh.processors.utils import ( + P4, + PAD_VAL, + add_selection, + get_var_mapping, + pad_val, +) +from coffea import processor +from coffea.analysis_tools import PackedSelection, Weights + +import HH4b + +from . 
import objects from .GenSelection import ( gen_selection_Hbb, gen_selection_HHbbbb, @@ -46,8 +54,6 @@ veto_electrons, veto_muons, ) -from .SkimmerABC import SkimmerABC -from .utils import P4, PAD_VAL, add_selection, get_var_mapping, pad_val # mapping samples to the appropriate function for doing gen-level selections gen_selection_dict = { diff --git a/src/HH4b/processors/corrections.py b/src/HH4b/processors/corrections.py deleted file mode 100644 index 36339b08..00000000 --- a/src/HH4b/processors/corrections.py +++ /dev/null @@ -1,413 +0,0 @@ -""" -Collection of utilities for corrections and systematics in processors. - -Loosely based on https://github.com/jennetd/hbb-coffea/blob/master/boostedhiggs/corrections.py - -Most corrections retrieved from the cms-nanoAOD repo: -See https://cms-nanoaod-integration.web.cern.ch/commonJSONSFs/ - -Authors: Raghav Kansal, Cristina Suarez -""" - -from __future__ import annotations - -import gzip -import pathlib -import pickle -import random - -import awkward as ak -import correctionlib -import numpy as np -import uproot -from coffea.analysis_tools import Weights -from coffea.nanoevents.methods import vector -from coffea.nanoevents.methods.base import NanoEventsArray -from coffea.nanoevents.methods.nanoaod import FatJetArray, JetArray - -from .utils import pad_val - -ak.behavior.update(vector.behavior) -package_path = str(pathlib.Path(__file__).parent.parent.resolve()) - -# Important Run3 start of Run -FirstRun_2022C = 355794 -FirstRun_2022D = 357487 -LastRun_2022D = 359021 -FirstRun_2022E = 359022 -LastRun_2022F = 362180 - -""" -CorrectionLib files are available from: /cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration - synced daily -""" -pog_correction_path = "/cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration/" -pog_jsons = { - "muon": ["MUO", "muon_Z.json.gz"], - "electron": ["EGM", "electron.json.gz"], - "pileup": ["LUM", "puWeights.json.gz"], - "fatjet_jec": ["JME", "fatJet_jerc.json.gz"], - "jet_jec": ["JME", 
"jet_jerc.json.gz"], - "jetveto": ["JME", "jetvetomaps.json.gz"], - "btagging": ["BTV", "btagging.json.gz"], -} - - -def get_UL_year(year: str) -> str: - return f"{year}_UL" - - -def get_pog_json(obj: str, year: str) -> str: - try: - pog_json = pog_jsons[obj] - except: - print(f"No json for {obj}") - - year = get_UL_year(year) if year == "2018" else year - if "2022" in year or "2023" in year: - year = { - "2022": "2022_Summer22", - "2022EE": "2022_Summer22EE", - "2023": "2023_Summer23", - "2023BPix": "2023_Summer23BPix", - }[year] - return f"{pog_correction_path}/POG/{pog_json[0]}/{year}/{pog_json[1]}" - - -def add_pileup_weight(weights: Weights, year: str, nPU: np.ndarray, dataset: str | None = None): - # clip nPU from 0 to 100 - nPU = np.clip(nPU, 0, 99) - # print(list(nPU)) - - if "Pu60" in dataset or "Pu70" in dataset: - # pileup profile from data - path_pileup = package_path + "/corrections/data/MyDataPileupHistogram2022FG.root" - pileup_profile = uproot.open(path_pileup)["pileup"] - pileup_profile = pileup_profile.to_numpy()[0] - # normalise - pileup_profile /= pileup_profile.sum() - - # https://indico.cern.ch/event/695872/contributions/2877123/attachments/1593469/2522749/pileup_ppd_feb_2018.pdf - # pileup profile from MC - pu_name = "Pu60" if "Pu60" in dataset else "Pu70" - path_pileup_dataset = package_path + f"/corrections/data/pileup/{pu_name}.npy" - pileup_MC = np.load(path_pileup_dataset) - - # avoid division by 0 (?) 
- pileup_MC[pileup_MC == 0.0] = 1 - pileup_correction = pileup_profile / pileup_MC - # remove large MC reweighting factors to prevent artifacts - pileup_correction[pileup_correction > 10] = 10 - sf = pileup_correction[nPU] - # no uncertainties - weights.add("pileup", sf) - - else: - # https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun3 - values = {} - - cset = correctionlib.CorrectionSet.from_file(get_pog_json("pileup", year)) - corr = { - "2018": "Collisions18_UltraLegacy_goldenJSON", - "2022": "Collisions2022_355100_357900_eraBCD_GoldenJson", - "2022EE": "Collisions2022_359022_362760_eraEFG_GoldenJson", - "2023": "Collisions2023_366403_369802_eraBC_GoldenJson", - "2023BPix": "Collisions2023_369803_370790_eraD_GoldenJson", - }[year] - # evaluate and clip up to 10 to avoid large weights - values["nominal"] = np.clip(cset[corr].evaluate(nPU, "nominal"), 0, 10) - values["up"] = np.clip(cset[corr].evaluate(nPU, "up"), 0, 10) - values["down"] = np.clip(cset[corr].evaluate(nPU, "down"), 0, 10) - - weights.add("pileup", values["nominal"], values["up"], values["down"]) - - -def get_vpt(genpart, check_offshell=False): - """Only the leptonic samples have no resonance in the decay tree, and only - when M is beyond the configured Breit-Wigner cutoff (usually 15*width) - """ - boson = ak.firsts( - genpart[ - ((genpart.pdgId == 23) | (abs(genpart.pdgId) == 24)) - & genpart.hasFlags(["fromHardProcess", "isLastCopy"]) - ] - ) - if check_offshell: - offshell = genpart[ - genpart.hasFlags(["fromHardProcess", "isLastCopy"]) - & ak.is_none(boson) - & (abs(genpart.pdgId) >= 11) - & (abs(genpart.pdgId) <= 16) - ].sum() - return ak.where(ak.is_none(boson.pt), offshell.pt, boson.pt) - return np.array(ak.fill_none(boson.pt, 0.0)) - - -def add_ps_weight(weights, ps_weights): - """ - Parton Shower Weights (FSR and ISR) - """ - - nweights = len(weights.weight()) - nom = np.ones(nweights) - - up_isr = np.ones(nweights) - down_isr = np.ones(nweights) - up_fsr = np.ones(nweights) - 
down_fsr = np.ones(nweights) - - if len(ps_weights[0]) == 4: - up_isr = ps_weights[:, 0] # ISR=2, FSR=1 - down_isr = ps_weights[:, 2] # ISR=0.5, FSR=1 - - up_fsr = ps_weights[:, 1] # ISR=1, FSR=2 - down_fsr = ps_weights[:, 3] # ISR=1, FSR=0.5 - - elif len(ps_weights[0]) > 1: - print("PS weight vector has length ", len(ps_weights[0])) - - weights.add("ISRPartonShower", nom, up_isr, down_isr) - weights.add("FSRPartonShower", nom, up_fsr, down_fsr) - - -def get_pdf_weights(events): - """ - For the PDF acceptance uncertainty: - - store 103 variations. 0-100 PDF values - - The last two values: alpha_s variations. - - you just sum the yield difference from the nominal in quadrature to get the total uncertainty. - e.g. https://github.com/LPC-HH/HHLooper/blob/master/python/prepare_card_SR_final.py#L258 - and https://github.com/LPC-HH/HHLooper/blob/master/app/HHLooper.cc#L1488 - - Some references: - Scale/PDF weights in MC https://twiki.cern.ch/twiki/bin/view/CMS/HowToPDF - https://twiki.cern.ch/twiki/bin/viewauth/CMS/TopSystematics#PDF - """ - return events.LHEPdfWeight.to_numpy() - - -def get_scale_weights(events): - """ - QCD Scale variations, best explanation I found is here: - https://twiki.cern.ch/twiki/bin/viewauth/CMS/TopSystematics#Factorization_and_renormalizatio - - TLDR: we want to vary the renormalization and factorization scales by a factor of 0.5 and 2, - and then take the envelope of the variations on our final observation as the up/down uncertainties. - - Importantly, we need to keep track of the normalization for each variation, - so that this uncertainty takes into account the acceptance effects of our selections. 
- - LHE scale variation weights (w_var / w_nominal) (from https://cms-nanoaod-integration.web.cern.ch/autoDoc/NanoAODv9/2018UL/doc_TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8_RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1.html#LHEScaleWeight) - [0] is renscfact=0.5d0 facscfact=0.5d0 ; <= - [1] is renscfact=0.5d0 facscfact=1d0 ; <= - [2] is renscfact=0.5d0 facscfact=2d0 ; - [3] is renscfact=1d0 facscfact=0.5d0 ; <= - [4] is renscfact=1d0 facscfact=1d0 ; - [5] is renscfact=1d0 facscfact=2d0 ; <= - [6] is renscfact=2d0 facscfact=0.5d0 ; - [7] is renscfact=2d0 facscfact=1d0 ; <= - [8] is renscfact=2d0 facscfact=2d0 ; <= - - See also https://git.rwth-aachen.de/3pia/cms_analyses/common/-/blob/11e0c5225416a580d27718997a11dc3f1ec1e8d1/processor/generator.py#L93 for an example. - """ - if len(events[0].LHEScaleWeight) > 0: - if len(events[0].LHEScaleWeight) == 9: - variations = events.LHEScaleWeight[:, [0, 1, 3, 5, 7, 8]].to_numpy() - nominal = events.LHEScaleWeight[:, 4].to_numpy()[:, np.newaxis] - variations /= nominal - else: - variations = events.LHEScaleWeight[:, [0, 1, 3, 4, 6, 7]].to_numpy() - return np.clip(variations, 0.0, 4.0) - else: - return None - - -class JECs: - def __init__(self, year): - if year in ["2022", "2022EE", "2023", "2023BPix"]: - jec_compiled = package_path + "/corrections/jec_compiled.pkl.gz" - elif year in ["2016", "2016APV", "2017", "2018"]: - jec_compiled = package_path + "/corrections/jec_compiled_run2.pkl.gz" - else: - jec_compiled = None - - self.jet_factory = {} - self.met_factory = None - - if jec_compiled is not None: - with gzip.open(jec_compiled, "rb") as filehandler: - jmestuff = pickle.load(filehandler) - - self.jet_factory["ak4"] = jmestuff["jet_factory"] - self.jet_factory["ak8"] = jmestuff["fatjet_factory"] - self.met_factory = jmestuff["met_factory"] - - def _add_jec_variables(self, jets: JetArray, event_rho: ak.Array, isData: bool) -> JetArray: - """add variables needed for JECs""" - jets["pt_raw"] = (1 - 
jets.rawFactor) * jets.pt - jets["mass_raw"] = (1 - jets.rawFactor) * jets.mass - jets["event_rho"] = ak.broadcast_arrays(event_rho, jets.pt)[0] - if not isData: - # gen pT needed for smearing - jets["pt_gen"] = ak.values_astype(ak.fill_none(jets.matched_gen.pt, 0), np.float32) - return jets - - def get_jec_jets( - self, - events: NanoEventsArray, - jets: FatJetArray, - year: str, - isData: bool = False, - jecs: dict[str, str] | None = None, - fatjets: bool = True, - applyData: bool = False, - dataset: str | None = None, - nano_version: str = "v12", - ) -> FatJetArray: - """ - If ``jecs`` is not None, returns the shifted values of variables are affected by JECs. - """ - - rho = ( - events.Rho.fixedGridRhoFastjetAll - if "Rho" in events.fields - else events.fixedGridRhoFastjetAll - ) - jets = self._add_jec_variables(jets, rho, isData) - - apply_jecs = ak.any(jets.pt) if (applyData or not isData) else False - if "v12" not in nano_version: - apply_jecs = False - if not apply_jecs: - return jets, None - - jec_vars = ["pt"] # variables we are saving that are affected by JECs - jet_factory_str = "ak4" - if fatjets: - jet_factory_str = "ak8" - - if self.jet_factory[jet_factory_str] is None: - print("No factory available") - return jets, None - - import cachetools - - jec_cache = cachetools.Cache(np.inf) - - if isData: - if year == "2022": - corr_key = f"{year}_runCD" - elif year == "2022EE" and "Run2022E" in dataset: - corr_key = f"{year}_runE" - elif year == "2022EE" and "Run2022F" in dataset: - corr_key = f"{year}_runF" - elif year == "2022EE" and "Run2022G" in dataset: - corr_key = f"{year}_runG" - elif year == "2023": - corr_key = "2023_runCv4" if "Run2023Cv4" in dataset else "2023_runCv123" - elif year == "2023BPix": - corr_key = "2023BPix_runD" - else: - print(dataset, year) - print("warning, no valid dataset, JECs won't be applied to data") - applyData = False - else: - corr_key = f"{year}mc" - - # fatjet_factory.build gives an error if there are no jets in event - 
if apply_jecs: - jets = self.jet_factory[jet_factory_str][corr_key].build(jets, jec_cache) - - # return only jets if no variations are given - if jecs is None or isData: - return jets, None - - jec_shifted_vars = {} - for jec_var in jec_vars: - tdict = {"": jets[jec_var]} - if apply_jecs: - for key, shift in jecs.items(): - for var in ["up", "down"]: - if shift in ak.fields(jets): - tdict[f"{key}_{var}"] = jets[shift][var][jec_var] - jec_shifted_vars[jec_var] = tdict - - return jets, jec_shifted_vars - - -def get_jmsr( - fatjets: FatJetArray, - num_jets: int, - jmsr_vars: list[str], - jms_values: dict, - jmr_values: dict, - isData: bool = False, - seed: int = 42, -) -> dict: - """Calculates post JMS/R masses and shifts""" - - jmsr_shifted_vars = {} - - for mkey in jmsr_vars: - tdict = {} - - mass = pad_val(fatjets[mkey], num_jets, axis=1) - jms = jms_values[mkey] - jmr = jmr_values[mkey] - - if isData: - tdict[""] = mass - else: - rng = np.random.default_rng(seed) - smearing = rng.normal(size=mass.shape) - # scale to JMR nom, down, up (minimum at 0) - jmr_nom, jmr_down, jmr_up = ((smearing * max(jmr[i] - 1, 0) + 1) for i in range(3)) - jms_nom, jms_down, jms_up = jms - - corr_mass_JMRUp = random.gauss(0.0, jmr[2] - 1.0) - corr_mass = max(jmr[0] - 1.0, 0.0) / (jmr[2] - 1.0) * corr_mass_JMRUp - - mass_jms = mass * jms_nom - mass_jmr = mass * jmr_nom - - tdict[""] = mass * jms_nom * (1.0 + corr_mass) - tdict["JMS_down"] = mass_jmr * jms_down - tdict["JMS_up"] = mass_jmr * jms_up - tdict["JMR_down"] = mass_jms * jmr_down - tdict["JMR_up"] = mass_jms * jmr_up - - jmsr_shifted_vars[mkey] = tdict - - return jmsr_shifted_vars - - -# Jet Veto Maps -# the JERC group recommends ALL analyses use these maps, as the JECs are derived excluding these zones. 
-# apply to both Data and MC -# https://cms-talk.web.cern.ch/t/jet-veto-maps-for-run3-data/18444?u=anmalara -# https://cms-talk.web.cern.ch/t/jes-for-2022-re-reco-cde-and-prompt-fg/32873 -def get_jetveto_event(jets: JetArray, year: str): - """ - Get event selection that rejects events with jets in the veto map - """ - - # correction: Non-zero value for (eta, phi) indicates that the region is vetoed - cset = correctionlib.CorrectionSet.from_file(get_pog_json("jetveto", year)) - j, nj = ak.flatten(jets), ak.num(jets) - - def get_veto(j, nj, csetstr): - j_phi = np.clip(np.array(j.phi), -3.1415, 3.1415) - j_eta = np.clip(np.array(j.eta), -4.7, 4.7) - veto = cset[csetstr].evaluate("jetvetomap", j_eta, j_phi) - return ak.unflatten(veto, nj) - - corr_str = { - "2022": "Summer22_23Sep2023_RunCD_V1", - "2022EE": "Summer22EE_23Sep2023_RunEFG_V1", - "2023": "Summer23Prompt23_RunC_V1", - "2023BPix": "Summer23BPixPrompt23_RunD_V1", - }[year] - - jet_veto = get_veto(j, nj, corr_str) > 0 - - event_sel = ~(ak.any((jets.pt > 15) & jet_veto, axis=1)) - return event_sel diff --git a/src/HH4b/processors/utils.py b/src/HH4b/processors/utils.py deleted file mode 100644 index dc5c26c5..00000000 --- a/src/HH4b/processors/utils.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Common functions for processors. 
- -Author(s): Raghav Kansal -""" - -from __future__ import annotations - -import awkward as ak -import numpy as np -from coffea.analysis_tools import PackedSelection - -P4 = { - "eta": "Eta", - "phi": "Phi", - "mass": "Mass", - "pt": "Pt", -} - - -PAD_VAL = -99999 - -jecs = { - "JES": "JES", - "JER": "JER", - # split JECs (unused for now) - # "JES_AbsoluteMPFBias": "JES_AbsoluteMPFBias", # goes in Absolute - # "JES_AbsoluteScale": "JES_AbsoluteScale", # goes in Absolute - # "JES_AbsoluteStat": "JES_AbsoluteStat", # goes in Abs_year - # "JES_FlavorQCD": "JES_FlavorQCD", - # "JES_Fragmentation": "JES_Fragmentation", # goes in Absolute - # "JES_PileUpDataMC": "JES_PileUpDataMC", # goes in Absolute - # "JES_PileUpPtBB": "JES_PileUpPtBB", # goes in BBEC1 - # "JES_PileUpPtEC1": "JES_PileUpPtEC1", # goes in BBEC1 - # "JES_PileUpPtEC2": "JES_PileUpPtEC2", - # "JES_PileUpPtHF": "JES_PileUpPtHF", - # "JES_PileUpPtRef": "JES_PileUpPtRef", # goes in Absolute - # "JES_RelativeFSR": "JES_RelativeFSR", # goes in Absolute - # "JES_RelativeJEREC1": "JES_RelativeJEREC1", # goes in BBEC1_year - # "JES_RelativeJEREC2": "JES_RelativeJEREC2", # goes in EC2_year - # "JES_RelativeJERHF": "JES_RelativeJERHF", # goes in HF - # "JES_RelativePtBB": "JES_RelativePtBB", # goes in BBEC1 - # "JES_RelativePtEC1": "JES_RelativePtEC1", # goes in BBEC1_year - # "JES_RelativePtEC2": "JES_RelativePtEC2", # goes in EC2_year - # "JES_RelativePtHF": "JES_RelativePtHF", # goes in HF - # "JES_RelativeBal": "JES_RelativeBal", - # "JES_RelativeSample": "JES_RelativeSample", - # "JES_RelativeStatEC": "JES_RelativeStatEC", # goes in BBEC1_year - # "JES_RelativeStatFSR": "JES_RelativeStatFSR", # goes in Abs_year - # "JES_RelativeStatHF": "JES_RelativeStatHF", - # "JES_SinglePionHCAL": "JES_SinglePionHCAL", # goes in Absolute - # "JES_SinglePionECAL": "JES_SinglePionECAL", # goes in Absolute - # "JES_TimePtEta": "JES_TimePtEta", # goes in Abs_year -} -jec_shifts = [] -for key in jecs: - for shift in ["up", 
"down"]: - jec_shifts.append(f"{key}_{shift}") - -jmsr = { - "JMS": "JMS", - "JMR": "JMR", -} -jmsr_shifts = [] -for key in jmsr: - for shift in ["up", "down"]: - jmsr_shifts.append(f"{key}_{shift}") - -# variables affected by JECs -jec_vars = [ - "bbFatJetPt", - "VBFJetPt", - "HHPt", - "HHeta", - "HHmass", - "H1Pt", - "H2Pt", - "H1Pt_HHmass", - "H2Pt_HHmass", - "H1Pt/H2Pt", - "VBFjjMass", - "VBFjjDeltaEta", -] - -# variables affected by JMS/JMR -jmsr_vars = [ - "bbFatJetPNetMassLegacy", - "HHmass", - "H1Pt_HHmass", - "H2Pt_HHmass", - "H1Mass", - "H2Mass", - "H1PNetMass", - "H2PNetMass", -] - - -def pad_val( - arr: ak.Array, - target: int, - value: float = PAD_VAL, - axis: int = 0, - to_numpy: bool = True, - clip: bool = True, -): - """ - pads awkward array up to ``target`` index along axis ``axis`` with value ``value``, - optionally converts to numpy array - """ - ret = ak.fill_none(ak.pad_none(arr, target, axis=axis, clip=clip), value, axis=axis) - return ret.to_numpy() if to_numpy else ret - - -def add_selection( - name: str, - sel: np.ndarray, - selection: PackedSelection, - cutflow: dict, - isData: bool, - genWeights: ak.Array = None, -): - """adds selection to PackedSelection object and the cutflow dictionary""" - if isinstance(sel, ak.Array): - sel = sel.to_numpy() - - selection.add(name, sel.astype(bool)) - cutflow[name] = ( - np.sum(selection.all(*selection.names)) - if isData - # add up genWeights for MC - else np.sum(genWeights[selection.all(*selection.names)]) - ) - - -def add_selection_no_cutflow( - name: str, - sel: np.ndarray, - selection: PackedSelection, -): - """adds selection to PackedSelection object""" - selection.add(name, ak.fill_none(sel, False)) - - -def concatenate_dicts(dicts_list: list[dict[str, np.ndarray]]): - """given a list of dicts of numpy arrays, concatenates the numpy arrays across the lists""" - if len(dicts_list) > 1: - return { - key: np.concatenate( - [ - dicts_list[i][key].reshape(dicts_list[i][key].shape[0], -1) - for i in 
range(len(dicts_list)) - ], - axis=1, - ) - for key in dicts_list[0] - } - - return dicts_list[0] - - -def select_dicts(dicts_list: list[dict[str, np.ndarray]], sel: np.ndarray): - """given a list of dicts of numpy arrays, select the entries per array across the lists according to ``sel``""" - return { - key: np.stack( - [ - dicts_list[i][key].reshape(dicts_list[i][key].shape[0], -1) - for i in range(len(dicts_list)) - ], - axis=1, - )[sel] - for key in dicts_list[0] - } - - -def remove_variation_suffix(var: str): - """removes the variation suffix from the variable name""" - if var.endswith("Down"): - return var.split("Down")[0] - elif var.endswith("Up"): - return var.split("Up")[0] - return var - - -def check_get_jec_var(var, jshift): - """Checks if var is affected by the JEC / JMSR and if so, returns the shifted var name""" - - if jshift in jec_shifts and var in jec_vars: - return var + "_" + jshift - - if jshift in jmsr_shifts and var in jmsr_vars: - return var + "_" + jshift - - return var - - -def get_var_mapping(jshift): - """Returns function that maps var to shifted var for a given systematic shift [JES|JER|JMS|JMR]_[up|down]""" - - def var_mapping(var): - return check_get_jec_var(var, jshift) - - return var_mapping