Skip to content

Commit cc633ce

Browse files
authored
Merge pull request #105 from ChrisCummins/fix/ProGraML/57
ml4pl/llvm2graph: Add support for struct inlining
2 parents 50e2e77 + 8b14bd8 commit cc633ce

File tree

5 files changed

+99
-4
lines changed

5 files changed

+99
-4
lines changed

deeplearning/ml4pl/graphs/llvm2graph/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ py_binary(
146146
"//labm8/py:app",
147147
"//labm8/py:bazelutil",
148148
"//labm8/py:decorators",
149+
"//labm8/py:fs",
149150
"//third_party/py/numpy",
150151
],
151152
)

deeplearning/ml4pl/graphs/llvm2graph/node_encoder.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
1818
When executed as a binary, this program reads a single program graph from
1919
stdin, encodes it, and writes a graph to stdout. Use --stdin_fmt and
20-
--stdout_fmt to convert between different graph types.
20+
--stdout_fmt to convert between different graph types, and --ir to read the
21+
IR file that the graph was constructed from, required for resolving struct
22+
definitions.
2123
2224
Example usage:
2325
@@ -26,10 +28,12 @@
2628
$ bazel run //deeplearning/ml4pl/graphs/llvm2graph:node_encoder -- \
2729
--stdin_fmt=pb \
2830
--stdout_fmt=pbtxt \
31+
--ir=/tmp/source.ll \
2932
< /tmp/proto.pb > /tmp/proto.pbtxt
3033
"""
3134
import pickle
3235
from typing import List
36+
from typing import Optional
3337

3438
import networkx as nx
3539
import numpy as np
@@ -40,9 +44,17 @@
4044
from labm8.py import app
4145
from labm8.py import bazelutil
4246
from labm8.py import decorators
47+
from labm8.py import fs
4348

4449

4550
FLAGS = app.FLAGS
51+
app.DEFINE_output_path(
52+
"ir",
53+
None,
54+
"The path of the IR file that was used to construct the graph. This is "
55+
"required to inline struct definitions. This argument may be omitted when "
56+
"struct definitions do not need to be inlined.",
57+
)
4658

4759
DICTIONARY = bazelutil.DataPath(
4860
"phd/deeplearning/ml4pl/graphs/llvm2graph/node_embeddings/inst2vec_augmented_dictionary.pickle"
@@ -69,20 +81,35 @@ def __init__(self):
6981
with open(str(AUGMENTED_INST2VEC_EMBEDDINGS), "rb") as f:
7082
self.node_text_embeddings = pickle.load(f)
7183

72-
def EncodeNodes(self, g: nx.DiGraph) -> None:
84+
def EncodeNodes(self, g: nx.DiGraph, ir: Optional[str] = None) -> None:
7385
"""Pre-process the node text and set the text embedding index.
7486
7587
For each node, this sets the 'preprocessed_text', 'x', and 'y' attributes.
7688
7789
Args:
7890
g: The graph to encode the nodes of.
91+
ir: The LLVM IR that was used to construct the graph. This is required for
92+
struct inlining. If struct inlining is not required, this may be
93+
omitted.
7994
"""
8095
# Pre-process the statements of the graph in a single pass.
8196
lines = [
8297
[data["text"]]
8398
for _, data in g.nodes(data=True)
8499
if data["type"] == programl_pb2.Node.STATEMENT
85100
]
101+
102+
if ir:
103+
# NOTE(github.com/ChrisCummins/ProGraML/issues/57): Extract the struct
104+
# definitions from the IR and inline their definitions in place of the
105+
# struct names. This is brittle string substitution; in the future we
106+
# should do this inlining in llvm2graph where we have a parsed
107+
# llvm::Module.
108+
structs = inst2vec_preprocess.GetStructTypes(ir)
109+
for line in lines:
110+
for struct, definition in structs.items():
111+
line[0] = line[0].replace(struct, definition)
112+
86113
preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
87114
preprocessed_texts = [
88115
inst2vec_preprocess.PreprocessStatement(x[0] if len(x) else "")
@@ -122,7 +149,8 @@ def Main():
122149
proto = programl.ReadStdin()
123150
g = programl.ProgramGraphToNetworkX(proto)
124151
encoder = GraphNodeEncoder()
125-
encoder.EncodeNodes(g)
152+
ir = fs.Read(FLAGS.ir) if FLAGS.ir else None
153+
encoder.EncodeNodes(g, ir=ir)
126154
programl.WriteStdout(programl.NetworkXToProgramGraph(g))
127155

128156

deeplearning/ncc/inst2vec/inst2vec_preprocess.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import os
2626
import pickle
2727
import re
28+
from typing import Dict
2829

2930
import networkx as nx
3031

@@ -2959,6 +2960,21 @@ def inline_struct_types_in_file(G, dic, specific_struct_name_pattern):
29592960
return G
29602961

29612962

2963+
def GetStructTypes(ir: str) -> Dict[str, str]:
2964+
"""Extract a dictionary of struct definitions from the given IR.
2965+
2966+
Args:
2967+
ir: A string of LLVM IR.
2968+
2969+
Returns:
2970+
A dictionary of <name, def> entries, where <name> is the name of a struct
2971+
definition (e.g. "%struct.foo"), and <def> is the definition of the member
2972+
types, e.g. "{ i32 }".
2973+
"""
2974+
_, dict_temp = construct_struct_types_dictionary_for_file(ir.split("\n"))
2975+
return dict_temp
2976+
2977+
29622978
def inline_struct_types(
29632979
G, data_with_struct_def, file_name, specific_struct_name_pattern
29642980
):

tools/BUILD

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
# Some of these scripts are meant for execution in-tree, and cannot be run
33
# using bazel, please see the comments.
44

5+
exports_files([
6+
"bazel",
7+
])
8+
59
sh_binary(
610
name = "whoami",
711
srcs = ["whoami.sh"],

tools/bazel

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,52 @@
2121
# Your mileage may vary.
2222
set -eu
2323

24+
build_path() {
25+
local _dir=""
26+
local _path=""
27+
28+
for _dir in "$@"
29+
do
30+
if [ -d $_dir ]; then
31+
_path=$_path:$_dir
32+
fi
33+
done
34+
35+
_path=${_path:1}
36+
echo $_path
37+
}
38+
39+
# build_path accepts an array of directories and returns a colon-separated path
40+
# of all of the directories that exist, in order. Example usage:
41+
#
42+
# dirs=("/usr/local/bin" /usr/bin "/not a real path")
43+
# unset FOO
44+
# FOO=$(build_path "${dirs[@]}")
45+
# echo $FOO
46+
# # Outputs: /usr/local/bin:/usr/bin
47+
path_dirs=( \
48+
/usr/local/opt/llvm/bin \
49+
/usr/local/opt/gnu-sed/libexec/gnubin \
50+
/usr/bin \
51+
/usr/local/bin \
52+
/bin \
53+
)
54+
55+
if [[ -f "/usr/local/opt/llvm/bin/clang" ]]; then
56+
CC=/usr/local/opt/llvm/bin/clang
57+
CXX=/usr/local/opt/llvm/bin/clang++
58+
else
59+
CC="$(which gcc)"
60+
CXX="$(which g++)"
61+
fi
62+
2463
# Set PULLER_TIMEOUT to increase the timeout on docker image pulls from the default
2564
# 600s. See: https://github.com/bazelbuild/rules_docker
26-
env -i TERM="$TERM" PATH=/usr/local/opt/gnu-sed/libexec/gnubin:/usr/bin:/usr/local/bin:/bin PULLER_TIMEOUT=3600 "$BAZEL_REAL" "$@"
65+
set +u
66+
env -i \
67+
TERM="$TERM" \
68+
PATH="$(build_path ${path_dirs[@]})" \
69+
CC=$CC \
70+
CXX=$CXX \
71+
PULLER_TIMEOUT=3600 \
72+
"$BAZEL_REAL" "$@"

0 commit comments

Comments
 (0)