scikit-hep · kratsg · Mar 12, 2025
diff --git a/src/pyhf/cli/cli.py b/src/pyhf/cli/cli.py
@@ -47,6 +47,7 @@ def pyhf():
 pyhf.add_command(spec.combine)
 pyhf.add_command(spec.digest)
 pyhf.add_command(spec.sort)
+pyhf.add_command(spec.hs3_to_hifa)
 
 # pyhf.add_command(infer.cli)
 pyhf.add_command(infer.fit)

diff --git a/src/pyhf/cli/spec.py b/src/pyhf/cli/spec.py
@@ -9,6 +9,8 @@
 from pyhf import modifiers
 from pyhf import parameters
 from pyhf import utils
+import re
+from math import sqrt
 
 log = logging.getLogger(__name__)
 
@@ -408,3 +410,338 @@
         with open(output_file, "w+", encoding="utf-8") as out_file:
             json.dump(sorted_ws, out_file, indent=4, sort_keys=True)
         log.debug(f"Written to {output_file}")
+
+
+def stripname(name, stripcomponents):
+    while True:
+        anymatch = False
+        for c in stripcomponents:
+            if name.endswith(c):
+                name = name[: -len(c)].strip("_")
+                anymatch = True
+                break
+            if name.startswith(c):
+                name = name[len(c) :].strip("_")
+                anymatch = True
+                break
+            if c in name:
+                name = name.replace("_" + c + "_", "_")
+                anymatch = True
+                break
+        if not anymatch:
+            return name
+
+
+@cli.command()
+@click.argument('workspace', default='-')
+@click.option(
+    '--output-file',
+    help='The location of the output json file. If not specified, prints to screen.',
+    default=None,
+)
+@click.option("--poi", default=None, help="Parameter of Interest")
+@click.option("--data", default="obsData", help="Name of the dataset")
+@click.option("--fix-other-pois", is_flag=True, help="Fix other POIs")
+@click.option(
+    "--nf-blacklist",
+    multiple=True,
+    default=["binWidth_.*", "one"],
+    help="Blacklist for normfactors",
+)
+@click.option(
+    "--sys-blacklist", multiple=True, default=["zero"], help="Blacklist for systematics"
+)
+@click.option(
+    "--strip-name-components",
+    multiple=True,
+    default=["model"],
+    help="Components to strip from names",
+)
+@click.option(
+    "--defactorize", is_flag=True, help="Merge OverallSys and Histosys of the same name"
+)
+def hs3_to_hifa(
+    workspace,
+    output_file,
+    poi,
+    data,
+    fix_other_pois,
+    nf_blacklist,
+    sys_blacklist,
+    strip_name_components,
+    defactorize,
+):
+    """
+    Convert the HS3 workspace to a HiFa JSON workspace for use with pyhf.
+
+    Taken from: https://gitlab.cern.ch/cburgard/RooFitUtils/-/blob/bbb079c1f597b138e2b638b0d98cff3835596240/scripts/json-roofit2pyhf.py
+
+    Example:
+
+    .. code-block:: shell
+
+        $ curl -sL https://raw.githubusercontent.com/root-project/root/12b7ffe/tutorials/roofit/roofit/rf515_hfJSON.json | pyhf sort | jq '.' | md5
+        8be5186ec249d2704e14dd29ef05ffb0
+
+    """
+    with click.open_file(workspace, "r", encoding="utf-8") as specstream:
+        spec = json.load(specstream)
+
+    variables = spec.get("variables", {})
+    bounds = {}
+    for domain in spec.get("domains", []):
+        for parameter in domain["axes"]:
+            bounds[parameter["name"]] = [parameter["min"], parameter["max"]]
+
+    # setup the main structure
+    pois = set()
+    for analysis in spec["analyses"]:
+        for poi in analysis["parameters_of_interest"]:
+            pois.add(poi)
+    measurement = {"name": "meas", "config": {"parameters": []}}
+    if poi:
+        measurement["config"]["poi"] = poi
+    elif len(pois) > 0:
+        measurement["config"]["poi"] = next(iter(pois))
+    output_json = {
+        "channels": [],
+        "measurements": [measurement],
+        "observations": [],
+        "version": "1.0.0",
+    }
+
+    # some bookkeeping
+    nps = set()
+    nfs = set()
+    channelnames = []
+
+    # define observations / data
+    this_data = {}
+    for key, channel in this_data.items():
+        if not isinstance(channel, dict):
+            continue
+        channelname = stripname(key, strip_name_components)
+        channelnames.append(channelname)
+        if "counts" in channel.keys():
+            output_json["observations"].append(
+                {"data": channel["counts"], "name": channelname}
+            )
+        else:
+            output_json["observations"].append(
+                {"data": channel["weights"], "name": channelname}
+            )
+
+    observations = []
+    parameters = {}
+
+    # define model /pdf
+    for pdf in sorted(spec.get("distributions", []), key=lambda x: x["name"]):
+        if pdf["type"] != "histfactory_dist":
+            continue
+        if "name" in pdf.keys():
+            key = pdf["name"]
+        channelname = stripname(key, strip_name_components)
+        for c in channelnames:
+            if c.startswith(channelname):
+                channelname = c
+
+        for any_data in spec["data"]:
+            if data in any_data["name"] and channelname in any_data["name"]:
+                observations.append({"data": any_data["contents"], "name": channelname})
+
+        out_channel = {"name": channelname, "samples": []}
+        output_json["channels"].append(out_channel)
+
+        if "samples" not in pdf.keys():
+            print("workspace is no histfactory workspace")
+            exit(1)
+
+        sum_values = None
+        sum_errors2 = None
+
+        for sample in pdf["samples"]:
+            if "data" not in sample.keys():
+                print("workspace no histfactory workspace")
+                exit(1)
+            has_staterror = False
+            for modifier in sample["modifiers"]:
+                if modifier["type"] == "staterror":
+                    has_staterror = True
+
+            if has_staterror:
+                values = sample["data"]["contents"]
+                if sum_values:
+                    for i in range(len(sum_values)):
+                        sum_values[i] += values[i]
+                else:
+                    sum_values = [v for v in values]
+
+                errors = sample["data"]["errors"]
+                if sum_errors2:
+                    for i in range(len(sum_errors2)):
+                        sum_errors2[i] += errors[i] * errors[i]
+                else:
+                    sum_errors2 = [e * e for e in errors]
+
+        for sample in sorted(pdf["samples"], key=lambda x: x["name"]):
+            values = sample["data"]["contents"]
+            bins = len(values)
+
+            out_sample = {
+                "name": stripname(sample["name"], strip_name_components),
+                "modifiers": [],
+            }
+            out_channel["samples"].append(out_sample)
+            out_sample["data"] = values
+
+            modifiers = out_sample["modifiers"]
+            for modifier in sorted(sample["modifiers"], key=lambda x: x["name"]):
+                if modifier.get("constraint", None) == "Const":
+                    continue
+                if modifier["name"] == "Lumi":
+                    modifiers.append({"name": "lumi", "type": "lumi", "data": None})
+                    nps.add("lumi")
+                    parameters["lumi"] = {
+                        "fixed": False,
+                        "inits": [1.0],
+                        "auxdata": [1.0],
+                        **(
+                            {"bounds": [bounds[modifier["name"]]]}
+                            if modifier["name"] in bounds
+                            else {}
+                        ),
+                    }
+                elif modifier["type"] == "staterror":
+                    parname = "staterror_" + channelname
+                    modifiers.append(
+                        {
+                            "name": parname,
+                            "type": "staterror",
+                            "data": [
+                                sqrt(sum_errors2[i]) / sum_values[i] * values[i]
+                                for i in range(bins)
+                            ],
+                        }
+                    )
+                    nps.add(parname)
+                    parameters[parname] = {
+                        "fixed": False,
+                        "auxdata": [1.0 for i in range(bins)],
+                        "inits": [1.0 for i in range(bins)],
+                        "bounds": [[-5.0, 5.0] for i in range(bins)],
+                    }
+                elif modifier["type"] == "normfactor":
+                    modifiers.append(
+                        {
+                            "name": modifier["name"],
+                            "type": "normfactor",
+                            "data": None,
+                        }
+                    )
+                    nfs.add(modifier["name"])
+                    parameters[modifier["name"]] = {
+                        "fixed": False,
+                        "inits": [1.0],
+                        **(
+                            {"bounds": [bounds[modifier["name"]]]}
+                            if modifier["name"] in bounds
+                            else {}
+                        ),
+                    }
+                elif modifier["type"] == "normsys":
+                    modifiers.append(
+                        {
+                            "name": modifier["name"],
+                            "type": "normsys",
+                            "data": modifier["data"],
+                        }
+                    )
+                    nps.add(modifier["name"])
+                    parameters[modifier["name"]] = {
+                        "fixed": False,
+                        "auxdata": [0.0],
+                        "inits": [0.0],
+                        **(
+                            {"bounds": [bounds[modifier["name"]]]}
+                            if modifier["name"] in bounds
+                            else {}
+                        ),
+                    }
+                elif modifier["type"] == "shapesys":
+                    parname = modifier["name"]
+                    nps.add(parname)
+                    modifiers.append(
+                        {
+                            "name": parname,
+                            "type": "shapesys",
+                            "data": modifier["data"]["vals"],
+                        }
+                    )
+                    parameters[parname] = {
+                        "fixed": False,
+                        "auxdata": [1.0 for i in range(bins)],
+                        "inits": [0.0 for i in range(bins)],
+                        "bounds": [[-5.0, 5.0] for i in range(bins)],
+                    }
+                    if bins == 9:
+                        print(parname)
+                elif modifier["type"] == "histosys":
+                    modifiers.append(
+                        {
+                            "name": modifier["name"],
+                            "type": "histosys",
+                            "data": {
+                                "hi_data": modifier["data"]["hi"]["contents"],
+                                "lo_data": modifier["data"]["lo"]["contents"],
+                            },
+                        }
+                    )
+                    nps.add(modifier["name"])
+                    parameters[modifier["name"]] = {
+                        "fixed": False,
+                        "auxdata": [0.0],
+                        "inits": [0.0],
+                        **(
+                            {"bounds": [bounds[modifier["name"]]]}
+                            if modifier["name"] in bounds
+                            else {}
+                        ),
+                    }
+                else:
+                    print(
+                        f"workspace contains unknown modifier type: {modifier['type']}"
+                    )
+                    exit(1)
+
+    # define parameters
+    for par in nps:
+        parameters[par]["name"] = par
+        measurement["config"]["parameters"].append(parameters[par])
+    for par in nfs:
+        if any([re.match(b, par) for b in nf_blacklist]):
+            continue
+        parameters[par]["name"] = par
+        measurement["config"]["parameters"].append(parameters[par])
+
+    # some post-processing
+    for p in measurement["config"]["parameters"]:
+        pname = p["name"]
+        if pname in variables.keys():
+            if "const" in variables[pname].keys():
+                p["fixed"] = variables[pname]["const"]
+        if fix_other_pois:
+            for this_poi in pois:
+                if this_poi == poi:
+                    continue
+                if pname == this_poi:
+                    p["fixed"] = True
+
+    # write observations
+    output_json["observations"] = observations
+
+    if output_file is None:
+        click.echo(json.dumps(output_json, indent=4, sort_keys=True))
+    else:
+        with open(output_file, "w+", encoding="utf-8") as out_file:
+            json.dump(output_json, out_file, indent=4, sort_keys=True)
+        log.debug(f"Written to {output_file}")