tensorflow
diff --git a/‎WORKSPACE
+29-3 b/‎WORKSPACE
+29-3
diff --git a/‎configure.sh
+4-4 b/‎configure.sh
+4-4
diff --git a/‎release/BUILD
+6-1 b/‎release/BUILD
+6-1
diff --git a/‎tensorflow_quantum/core/ops/BUILD
+215-1 b/‎tensorflow_quantum/core/ops/BUILD
+215-1
@@ -24,11 +24,19 @@ cc_library(
         ],
 )
 
+# http_archive(
+#     name = "qsim",
+#     sha256 = "b9c1eba09a885a938b5e73dfc2e02f5231cf3b01d899415caa24769346a731d5",
+#     strip_prefix = "qsim-0.13.3",
+#     urls = ["https://github.com/quantumlib/qsim/archive/refs/tags/v0.13.3.zip"],
+# )
+
+# TODO: Once this patch is merged into upstream qsim, delete the fork-based http_archive below and restore the commented-out release archive above.
 http_archive(
     name = "qsim",
-    sha256 = "b9c1eba09a885a938b5e73dfc2e02f5231cf3b01d899415caa24769346a731d5",
-    strip_prefix = "qsim-0.13.3",
-    urls = ["https://github.com/quantumlib/qsim/archive/refs/tags/v0.13.3.zip"],
+    sha256 = "",  # NOTE(review): empty checksum, so the archive is not pinned to a known hash - fill in before merging
+    strip_prefix = "qsim-0.15.0-dev20230327_v3",
+    urls = ["https://github.com/jaeyoo/qsim/archive/refs/tags/v0.15.0+dev20230327_v3.tar.gz"],
 )
 
 http_archive(
@@ -73,3 +81,27 @@ bind(
     actual = "@six_archive//:six",
 )
 
+# Local cuQuantum archive exposing the cuStateVec header and shared library.
+# The header is listed in `hdrs` (with `includes`) so that targets depending on
+# :custatevec_headers can actually include it; headers in `srcs` stay private.
+# NOTE(review): `path` is an absolute, machine-specific directory and must be
+# pointed at the local cuQuantum archive before building on another machine.
+new_local_repository(
+    name = "cuquantum_libs",
+    path = "/usr/local/google/home/jaeyoo/workspace/cuquantum-linux-x86_64-22.11.0.13-archive",
+    build_file_content = """
+cc_library(
+    name = "custatevec_headers",
+    hdrs = ["include/custatevec.h"],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "custatevec",
+    srcs = ["lib/libcustatevec.so"],
+    visibility = ["//visibility:public"],
+)
+""",
+)
+
@@ -62,11 +62,14 @@ while [[ "$TF_NEED_CUDA" == "" ]]; do
 done
 
+# Ask which TensorFlow generation is targeted and derive TF_CUDA_VERSION from
+# the answer: "y" or an empty reply selects 11, "n" selects 10.0. Any other
+# reply leaves TF_CUDA_VERSION empty, so the loop prompts again.
 while [[ "$TF_CUDA_VERSION" == "" ]]; do
-  read -p "Are you building against TensorFlow 2.1(including RCs) or newer?[Y/n] " INPUT
+  read -r -p "Are you building against TensorFlow 2.11(including RCs) or newer?[Y/n] " INPUT
   case $INPUT in
-    [Yy]* ) echo "Build against TensorFlow 2.1 or newer."; TF_CUDA_VERSION=11;;
-    [Nn]* ) echo "Build against TensorFlow <2.1."; TF_CUDA_VERSION=10.0;;
-    "" ) echo "Build against TensorFlow 2.1 or newer."; TF_CUDA_VERSION=11;;
-    * ) echo "Invalid selection: " $INPUT;;
+    [Yy]* ) echo "Build against TensorFlow 2.11 or newer."; TF_CUDA_VERSION=11;;
+    [Nn]* ) echo "Build against TensorFlow <2.11."; TF_CUDA_VERSION=10.0;;
+    "" ) echo "Build against TensorFlow 2.11 or newer."; TF_CUDA_VERSION=11;;
+    * ) echo "Invalid selection: " "$INPUT";;
   esac
 done
 
@@ -1,3 +1,5 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+
 licenses(["notice"])
 
 sh_binary(
@@ -66,5 +68,8 @@ sh_binary(
         "//tensorflow_quantum/python:util",
         "//tensorflow_quantum/python/optimizers:rotosolve_minimizer",
         "//tensorflow_quantum/python/optimizers:spsa_minimizer",
-    ],
+    ] + if_cuda_is_configured([
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuda_py",
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuquantum_py",
+    ]),
 )
@@ -1,4 +1,6 @@
 # load op_wrapper
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_gpu_kernel_library", "tf_gen_op_wrapper_py")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured", "if_cuda")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -12,6 +14,23 @@ config_setting(
     constraint_values = ["@bazel_tools//platforms:windows"],
 )
 
+# Wraps the CUDA runtime (cudart) as both a link-time dep and a runtime data
+# file; referenced as ":cuda" by the GPU op binaries in this package.
+cc_library(
+    name = "cuda",
+    data = ["@local_config_cuda//cuda:cudart"],
+    linkopts = select({
+        # No extra linker options on Windows.
+        ":windows": [],
+        # Elsewhere, add rpath entries for the CUDA and CUPTI lib64 directories.
+        "//conditions:default": [
+            "-Wl,-rpath,../local_config_cuda/cuda/lib64",
+            "-Wl,-rpath,../local_config_cuda/cuda/extras/CUPTI/lib64",
+        ],
+    }),
+    deps = ["@local_config_cuda//cuda:cudart"],
+)
+
 py_library(
     name = "ops",
     srcs = ["__init__.py"],
@@ -30,7 +49,10 @@ py_library(
         "//tensorflow_quantum/core/ops/math_ops:inner_product_op_py",
         "//tensorflow_quantum/core/ops/math_ops:fidelity_op_py",
         "//tensorflow_quantum/core/ops/noise:noisy_expectation_op_py",
-    ],
+    ] + if_cuda_is_configured([
+        ":tfq_simulate_ops_cuda_py",
+        ":tfq_simulate_ops_cuquantum_py",
+    ]),
 )
 
 cc_binary(
@@ -619,6 +641,198 @@ py_test(
     ],
 )
 
+# Python source packaged together with the _tfq_simulate_ops_cuda.so binary.
+py_library(
+    name = "tfq_simulate_ops_cuda_py",
+    srcs = ["tfq_simulate_ops_cuda.py"],
+    # The compiled op library travels with the Python source as runtime data.
+    data = [":_tfq_simulate_ops_cuda.so"],
+    srcs_version = "PY3",
+    deps = [
+        # tensorflow framework for wrappers
+        ":load_module",
+    ],
+)
+
+# Python source packaged together with the _tfq_simulate_ops_cuquantum.so binary.
+py_library(
+    name = "tfq_simulate_ops_cuquantum_py",
+    srcs = ["tfq_simulate_ops_cuquantum.py"],
+    # The compiled op library travels with the Python source as runtime data.
+    data = [":_tfq_simulate_ops_cuquantum.so"],
+    srcs_version = "PY3",
+    deps = [
+        # tensorflow framework for wrappers
+        ":load_module",
+    ],
+)
+
+py_test(
+    name = "tfq_simulate_ops_gpu_test",
+    srcs = ["tfq_simulate_ops_gpu_test.py"],
+    srcs_version = "PY3",
+    deps = [  # NOTE(review): both GPU wrappers are unconditional deps here; confirm this test is only built with CUDA configured
+        ":tfq_simulate_ops_cuda_py",
+        ":tfq_simulate_ops_cuquantum_py",
+        ":tfq_simulate_ops_py",
+        "//tensorflow_quantum/python:util",
+    ],
+)
+
+# Shared library built from tfq_simulate_expectation_op_cuda.cu.cc; CUDA-only flags and deps are appended via if_cuda_is_configured.
+cc_binary(
+    name = "_tfq_simulate_ops_cuda.so",
+    srcs = ["tfq_simulate_expectation_op_cuda.cu.cc"],
+    copts = select({
+        ":windows": [
+            "/D__CLANG_SUPPORT_DYN_ANNOTATION__",
+            "/D_USE_MATH_DEFINES",
+            "/DEIGEN_MPL2_ONLY",
+            "/DEIGEN_MAX_ALIGN_BYTES=64",
+            "/DEIGEN_HAS_TYPE_TRAITS=0",
+            "/DTF_USE_SNAPPY",
+            "/showIncludes",
+            "/MD",
+            "/O2",
+            "/DNDEBUG",
+            "/w",
+            "-DWIN32_LEAN_AND_MEAN",
+            "-DNOGDI",
+            "/d2ReducedOptimizeHugeFunctions",
+            "/arch:AVX",
+            "/std:c++17",
+            "-DTENSORFLOW_MONOLITHIC_BUILD",
+            "/DPLATFORM_WINDOWS",
+            "/DEIGEN_HAS_C99_MATH",
+            "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+            "/DEIGEN_AVOID_STL_ARRAY",
+            "/Iexternal/gemmlowp",
+            "/wd4018",
+            "/wd4577",
+            "/DNOGDI",
+            "/UTF_COMPILE_LIBRARY",
+        ],
+        "//conditions:default": [
+            "-Iexternal/local_cuda/cuda/include",
+            # Disabled flags kept for reference: "--cuda-gpu-arch=sm_86", "-L/usr/local/cuda/lib64", "-lcudart_static", "-ldl", "-lrt", "-fpermissive".
+            "-pthread",
+            "-std=c++17",
+            "-D_GLIBCXX_USE_CXX11_ABI=1",
+            "-O3",
+            "-Iexternal/cuda_headers",
+            "-DNV_CUDNN_DISABLE_EXCEPTION",
+        ],
+    }) + if_cuda_is_configured([
+        "-DTENSORFLOW_USE_NVCC=1",
+        "-DGOOGLE_CUDA=1",
+        "-x cuda",
+        "-nvcc_options=relaxed-constexpr",
+        "-nvcc_options=ftz=true",
+    ]),
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+    linkshared = 1,
+    deps = [
+        # cirq cc proto
+        "//tensorflow_quantum/core/ops:parse_context",
+        "//tensorflow_quantum/core/ops:tfq_simulate_utils",
+        "//tensorflow_quantum/core/proto:pauli_sum_cc_proto",
+        "//tensorflow_quantum/core/proto:program_cc_proto",
+        "//tensorflow_quantum/core/src:circuit_parser_qsim",
+        "//tensorflow_quantum/core/src:util_qsim",
+        "@eigen//:eigen3",
+        # "@local_cuda//:cuda_headers"
+        # tensorflow core framework
+        # tensorflow core lib
+        # tensorflow core protos
+    ] + if_cuda_is_configured([
+        ":cuda",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@qsim//lib:qsim_cuda_lib",
+    ]),
+    # alwayslink=1,
+)
+
+# Shared library built from tfq_simulate_expectation_op_cuquantum.cu.cc; CUDA and cuQuantum flags/deps are appended via if_cuda_is_configured.
+cc_binary(
+    name = "_tfq_simulate_ops_cuquantum.so",
+    srcs = ["tfq_simulate_expectation_op_cuquantum.cu.cc"],
+    copts = select({
+        ":windows": [
+            "/D__CLANG_SUPPORT_DYN_ANNOTATION__",
+            "/D_USE_MATH_DEFINES",
+            "/DEIGEN_MPL2_ONLY",
+            "/DEIGEN_MAX_ALIGN_BYTES=64",
+            "/DEIGEN_HAS_TYPE_TRAITS=0",
+            "/DTF_USE_SNAPPY",
+            "/showIncludes",
+            "/MD",
+            "/O2",
+            "/DNDEBUG",
+            "/w",
+            "-DWIN32_LEAN_AND_MEAN",
+            "-DNOGDI",
+            "/d2ReducedOptimizeHugeFunctions",
+            "/arch:AVX",
+            "/std:c++17",
+            "-DTENSORFLOW_MONOLITHIC_BUILD",
+            "/DPLATFORM_WINDOWS",
+            "/DEIGEN_HAS_C99_MATH",
+            "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+            "/DEIGEN_AVOID_STL_ARRAY",
+            "/Iexternal/gemmlowp",
+            "/wd4018",
+            "/wd4577",
+            "/DNOGDI",
+            "/UTF_COMPILE_LIBRARY",
+        ],
+        "//conditions:default": [
+            "-Iexternal/local_cuda/cuda/include",
+            # Disabled flags kept for reference: "--cuda-gpu-arch=sm_86", "-L/usr/local/cuda/lib64", "-lcudart_static", "-ldl", "-lrt", "-fpermissive".
+            "-pthread",
+            "-std=c++17",
+            "-D_GLIBCXX_USE_CXX11_ABI=1",
+            "-O3",
+            "-Iexternal/cuda_headers",
+            "-DNV_CUDNN_DISABLE_EXCEPTION",
+        ],
+    }) + if_cuda_is_configured([
+        "-DTENSORFLOW_USE_NVCC=1",
+        "-DGOOGLE_CUDA=1",
+        "-x cuda",
+        "-nvcc_options=relaxed-constexpr",
+        "-nvcc_options=ftz=true",
+    ]),
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+    linkshared = 1,
+    deps = [
+        # cirq cc proto
+        "//tensorflow_quantum/core/ops:parse_context",
+        "//tensorflow_quantum/core/ops:tfq_simulate_utils",
+        "//tensorflow_quantum/core/proto:pauli_sum_cc_proto",
+        "//tensorflow_quantum/core/proto:program_cc_proto",
+        "//tensorflow_quantum/core/src:circuit_parser_qsim",
+        "//tensorflow_quantum/core/src:util_qsim",
+        "@eigen//:eigen3",
+        # "@local_cuda//:cuda_headers"
+        # tensorflow core framework
+        # tensorflow core lib
+        # tensorflow core protos
+    ] + if_cuda_is_configured([
+        ":cuda",
+        "@cuquantum_libs//:custatevec",
+        "@cuquantum_libs//:custatevec_headers",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@qsim//lib:qsim_cuquantum_lib",
+    ]),
+    # alwayslink=1,
+)
+
 py_library(
     name = "load_module",
     srcs = ["load_module.py"],