Skip to content

feat: cli hs3-to-pyhf functionality #2578

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/pyhf/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def pyhf():
pyhf.add_command(spec.combine)
pyhf.add_command(spec.digest)
pyhf.add_command(spec.sort)
pyhf.add_command(spec.hs3_to_hifa)

# pyhf.add_command(infer.cli)
pyhf.add_command(infer.fit)
Expand Down
337 changes: 337 additions & 0 deletions src/pyhf/cli/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from pyhf import modifiers
from pyhf import parameters
from pyhf import utils
import re
from math import sqrt

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -408,3 +410,338 @@
with open(output_file, "w+", encoding="utf-8") as out_file:
json.dump(sorted_ws, out_file, indent=4, sort_keys=True)
log.debug(f"Written to {output_file}")


def stripname(name, stripcomponents):
while True:
anymatch = False
for c in stripcomponents:
if name.endswith(c):
name = name[: -len(c)].strip("_")
anymatch = True
break
if name.startswith(c):
name = name[len(c) :].strip("_")
anymatch = True
break
if c in name:
name = name.replace("_" + c + "_", "_")
anymatch = True
break
if not anymatch:
return name


@cli.command()
@click.argument('workspace', default='-')
@click.option(
'--output-file',
help='The location of the output json file. If not specified, prints to screen.',
default=None,
)
@click.option("--poi", default=None, help="Parameter of Interest")
@click.option("--data", default="obsData", help="Name of the dataset")
@click.option("--fix-other-pois", is_flag=True, help="Fix other POIs")
@click.option(
"--nf-blacklist",
multiple=True,
default=["binWidth_.*", "one"],
help="Blacklist for normfactors",
)
@click.option(
"--sys-blacklist", multiple=True, default=["zero"], help="Blacklist for systematics"
)
@click.option(
"--strip-name-components",
multiple=True,
default=["model"],
help="Components to strip from names",
)
@click.option(
"--defactorize", is_flag=True, help="Merge OverallSys and Histosys of the same name"
)
def hs3_to_hifa(
workspace,
output_file,
poi,
data,
fix_other_pois,
nf_blacklist,
sys_blacklist,
strip_name_components,
defactorize,
):
"""
Convert the HS3 workspace to a HiFa JSON workspace for use with pyhf.

Taken from: https://gitlab.cern.ch/cburgard/RooFitUtils/-/blob/bbb079c1f597b138e2b638b0d98cff3835596240/scripts/json-roofit2pyhf.py

Example:

.. code-block:: shell

$ curl -sL https://raw.githubusercontent.com/root-project/root/12b7ffe/tutorials/roofit/roofit/rf515_hfJSON.json | pyhf sort | jq '.' | md5
8be5186ec249d2704e14dd29ef05ffb0

"""
with click.open_file(workspace, "r", encoding="utf-8") as specstream:
spec = json.load(specstream)

variables = spec.get("variables", {})
bounds = {}
for domain in spec.get("domains", []):
for parameter in domain["axes"]:
bounds[parameter["name"]] = [parameter["min"], parameter["max"]]

# setup the main structure
pois = set()
for analysis in spec["analyses"]:
for poi in analysis["parameters_of_interest"]:
pois.add(poi)
measurement = {"name": "meas", "config": {"parameters": []}}
if poi:
measurement["config"]["poi"] = poi
elif len(pois) > 0:
measurement["config"]["poi"] = next(iter(pois))
output_json = {
"channels": [],
"measurements": [measurement],
"observations": [],
"version": "1.0.0",
}

# some bookkeeping
nps = set()
nfs = set()
channelnames = []

# define observations / data
this_data = {}
for key, channel in this_data.items():
if not isinstance(channel, dict):
continue
channelname = stripname(key, strip_name_components)
channelnames.append(channelname)
if "counts" in channel.keys():
output_json["observations"].append(
{"data": channel["counts"], "name": channelname}
)
else:
output_json["observations"].append(
{"data": channel["weights"], "name": channelname}
)

observations = []
parameters = {}

# define model /pdf
for pdf in sorted(spec.get("distributions", []), key=lambda x: x["name"]):
if pdf["type"] != "histfactory_dist":
continue
if "name" in pdf.keys():
key = pdf["name"]
channelname = stripname(key, strip_name_components)
for c in channelnames:
if c.startswith(channelname):
channelname = c

for any_data in spec["data"]:
if data in any_data["name"] and channelname in any_data["name"]:
observations.append({"data": any_data["contents"], "name": channelname})

out_channel = {"name": channelname, "samples": []}
output_json["channels"].append(out_channel)

if "samples" not in pdf.keys():
print("workspace is no histfactory workspace")
exit(1)

sum_values = None
sum_errors2 = None

for sample in pdf["samples"]:
if "data" not in sample.keys():
print("workspace no histfactory workspace")
exit(1)
has_staterror = False
for modifier in sample["modifiers"]:
if modifier["type"] == "staterror":
has_staterror = True

if has_staterror:
values = sample["data"]["contents"]
if sum_values:
for i in range(len(sum_values)):
sum_values[i] += values[i]
else:
sum_values = [v for v in values]

errors = sample["data"]["errors"]
if sum_errors2:
for i in range(len(sum_errors2)):
sum_errors2[i] += errors[i] * errors[i]
else:
sum_errors2 = [e * e for e in errors]

for sample in sorted(pdf["samples"], key=lambda x: x["name"]):
values = sample["data"]["contents"]
bins = len(values)

out_sample = {
"name": stripname(sample["name"], strip_name_components),
"modifiers": [],
}
out_channel["samples"].append(out_sample)
out_sample["data"] = values

modifiers = out_sample["modifiers"]
for modifier in sorted(sample["modifiers"], key=lambda x: x["name"]):
if modifier.get("constraint", None) == "Const":
continue
if modifier["name"] == "Lumi":
modifiers.append({"name": "lumi", "type": "lumi", "data": None})
nps.add("lumi")
parameters["lumi"] = {
"fixed": False,
"inits": [1.0],
"auxdata": [1.0],
**(
{"bounds": [bounds[modifier["name"]]]}
if modifier["name"] in bounds
else {}
),
}
elif modifier["type"] == "staterror":
parname = "staterror_" + channelname
modifiers.append(
{
"name": parname,
"type": "staterror",
"data": [
sqrt(sum_errors2[i]) / sum_values[i] * values[i]
for i in range(bins)
],
}
)
nps.add(parname)
parameters[parname] = {
"fixed": False,
"auxdata": [1.0 for i in range(bins)],
"inits": [1.0 for i in range(bins)],
"bounds": [[-5.0, 5.0] for i in range(bins)],
}
elif modifier["type"] == "normfactor":
modifiers.append(
{
"name": modifier["name"],
"type": "normfactor",
"data": None,
}
)
nfs.add(modifier["name"])
parameters[modifier["name"]] = {
"fixed": False,
"inits": [1.0],
**(
{"bounds": [bounds[modifier["name"]]]}
if modifier["name"] in bounds
else {}
),
}
elif modifier["type"] == "normsys":
modifiers.append(
{
"name": modifier["name"],
"type": "normsys",
"data": modifier["data"],
}
)
nps.add(modifier["name"])
parameters[modifier["name"]] = {
"fixed": False,
"auxdata": [0.0],
"inits": [0.0],
**(
{"bounds": [bounds[modifier["name"]]]}
if modifier["name"] in bounds
else {}
),
}
elif modifier["type"] == "shapesys":
parname = modifier["name"]
nps.add(parname)
modifiers.append(
{
"name": parname,
"type": "shapesys",
"data": modifier["data"]["vals"],
}
)
parameters[parname] = {
"fixed": False,
"auxdata": [1.0 for i in range(bins)],
"inits": [0.0 for i in range(bins)],
"bounds": [[-5.0, 5.0] for i in range(bins)],
}
if bins == 9:
print(parname)
elif modifier["type"] == "histosys":
modifiers.append(
{
"name": modifier["name"],
"type": "histosys",
"data": {
"hi_data": modifier["data"]["hi"]["contents"],
"lo_data": modifier["data"]["lo"]["contents"],
},
}
)
nps.add(modifier["name"])
parameters[modifier["name"]] = {
"fixed": False,
"auxdata": [0.0],
"inits": [0.0],
**(
{"bounds": [bounds[modifier["name"]]]}
if modifier["name"] in bounds
else {}
),
}
else:
print(
f"workspace contains unknown modifier type: {modifier['type']}"
)
exit(1)

# define parameters
for par in nps:
parameters[par]["name"] = par
measurement["config"]["parameters"].append(parameters[par])
for par in nfs:
if any([re.match(b, par) for b in nf_blacklist]):
continue
parameters[par]["name"] = par
measurement["config"]["parameters"].append(parameters[par])

# some post-processing
for p in measurement["config"]["parameters"]:
pname = p["name"]
if pname in variables.keys():
if "const" in variables[pname].keys():
p["fixed"] = variables[pname]["const"]
if fix_other_pois:
for this_poi in pois:
if this_poi == poi:
continue
if pname == this_poi:
p["fixed"] = True

# write observations
output_json["observations"] = observations

if output_file is None:
click.echo(json.dumps(output_json, indent=4, sort_keys=True))
else:
with open(output_file, "w+", encoding="utf-8") as out_file:
json.dump(output_json, out_file, indent=4, sort_keys=True)
log.debug(f"Written to {output_file}")

Check warning on line 747 in src/pyhf/cli/spec.py

View check run for this annotation

codefactor.io / CodeFactor

src/pyhf/cli/spec.py#L463-L747

Very Complex Method
Loading