tensorflow
diff --git a/‎scripts/ci_install.sh
+2-2 b/‎scripts/ci_install.sh
+2-2
diff --git a/‎tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuda.cu.cc
+207 b/‎tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuda.cu.cc
+207
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-wget https://github.com/bazelbuild/bazel/releases/download/5.1.0/bazel_5.1.0-linux-x86_64.deb
-sudo dpkg -i bazel_5.1.0-linux-x86_64.deb
+wget https://github.com/bazelbuild/bazel/releases/download/5.3.0/bazel_5.3.0-linux-x86_64.deb
+sudo dpkg -i bazel_5.3.0-linux-x86_64.deb
 pip install --upgrade pip setuptools wheel
 pip install -r requirements.txt
@@ -0,0 +1,207 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include <chrono>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/gates_qsim.h"
+#include "../qsim/lib/seqfor.h"
+#include "../qsim/lib/simulator_cuda.h"
+#include "../qsim/lib/statespace_cuda.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+
+class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateExpectationOpCuda(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 4,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 4 inputs, got ", num_inputs, " inputs.")));
+
+    // Create the output Tensor.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_op_size = context->input(3).dim_size(1);
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_op_size);
+
+    tensorflow::Tensor* output = nullptr;
+    tensorflow::AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    alloc_attr.set_gpu_compatible(true);
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output,
+                                                     alloc_attr));
+    auto output_tensor = output->matrix<float>();
+    // Parse program protos.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    std::vector<std::vector<PauliSum>> pauli_sums;
+    OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs,
+                                                    &num_qubits, &pauli_sums));
+
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    Status parse_status = Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+    ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
+                 &output_tensor);
+  }
+
+ private:
+  int num_threads_in_sim_;
+  int block_count_;
+
+  // Define the GPU implementation that launches the CUDA kernel.
+  void ComputeLarge(
+      const std::vector<int>& num_qubits,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      const std::vector<std::vector<PauliSum>>& pauli_sums,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCUDA<float>;
+    using StateSpace = Simulator::StateSpace;
+    // Begin simulation with default parameters.
+    int largest_nq = 1;
+    Simulator sim = Simulator();
+    StateSpace ss = StateSpace(StateSpace::Parameter());
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+
+    // Simulate programs one by one. Parallelizing over state vectors
+    // we no longer parallelize over circuits. Each time we encounter a
+    // a larger circuit we will grow the Statevector as necessary.
+    for (int i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+      }
+      // TODO: add heuristic here so that we do not always recompute
+      //  the state if there is a possibility that circuit[i] and
+      //  circuit[i + 1] produce the same state.
+      ss.SetStateZero(sv);
+      for (int j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+      for (int j = 0; j < pauli_sums[i].size(); j++) {
+        // (#679) Just ignore empty program
+        if (fused_circuits[i].size() == 0) {
+          (*output_tensor)(i, j) = -2.0;
+          continue;
+        }
+        float exp_v = 0.0;
+        OP_REQUIRES_OK(context,
+                       ComputeExpectationQsim(pauli_sums[i][j], sim, ss, sv,
+                                              scratch, &exp_v));
+        (*output_tensor)(i, j) = exp_v;
+      }
+    }
+  }
+
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqSimulateExpectationCuda").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateExpectationOpCuda);
+
+REGISTER_OP("TfqSimulateExpectationCuda")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("pauli_sums: string")
+    .Output("expectations: float")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle pauli_sums_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape));
+
+      tensorflow::shape_inference::DimensionHandle output_rows =
+          c->Dim(programs_shape, 0);
+      tensorflow::shape_inference::DimensionHandle output_cols =
+          c->Dim(pauli_sums_shape, 1);
+      c->set_output(0, c->Matrix(output_rows, output_cols));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq