
Commit 79aae25

Merge pull request #3 from jaeyoo/pavan/remove_compute_small
Remove compute small
2 parents 46aa1f0 + 20815bc

6 files changed (+31, -166 lines)


.bazelversion (+1, -1)

@@ -1 +1 @@
-5.1.0
+5.3.0

release/BUILD (+6, -1)

@@ -1,3 +1,5 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+
 licenses(["notice"])
 
 sh_binary(
@@ -66,5 +68,8 @@ sh_binary(
         "//tensorflow_quantum/python:util",
         "//tensorflow_quantum/python/optimizers:rotosolve_minimizer",
         "//tensorflow_quantum/python/optimizers:spsa_minimizer",
-    ],
+    ] + if_cuda_is_configured([
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuda_py",
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuquantum_py",
+    ]),
 )
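The mechanism behind this hunk: `if_cuda_is_configured` (loaded from `@local_config_cuda//cuda:build_defs.bzl` above) expands to its argument when the Bazel workspace was configured with CUDA and to an empty list otherwise, so the `+` concatenation pulls the GPU op libraries into the release only on CUDA builds. A rough Python model of that behavior, for illustration only (the real macro is Starlark, and the `cuda_configured` flag here is a stand-in for configure-time state):

    # Illustrative model, not the real macro: whether CUDA is configured
    # is decided when the @local_config_cuda repository is generated.
    def if_cuda_is_configured(if_true, cuda_configured=False):
        # Pass the dependency list through on CUDA builds; otherwise
        # return [], leaving only the CPU deps after concatenation.
        return if_true if cuda_configured else []

    deps = ["//tensorflow_quantum/python:util"] + if_cuda_is_configured(
        ["//tensorflow_quantum/core/ops:tfq_simulate_ops_cuda_py"])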

tensorflow_quantum/core/ops/BUILD (+4, -2)

@@ -43,14 +43,16 @@ py_library(
         ":tfq_adj_grad_op_py",
         ":tfq_ps_util_ops_py",
         ":tfq_simulate_ops_py",
-        ":tfq_simulate_ops_cuda_py",
         ":tfq_unitary_op_py",
         ":tfq_utility_ops_py",
         # test addons
         "//tensorflow_quantum/core/ops/math_ops:inner_product_op_py",
         "//tensorflow_quantum/core/ops/math_ops:fidelity_op_py",
         "//tensorflow_quantum/core/ops/noise:noisy_expectation_op_py",
-    ],
+    ] + if_cuda_is_configured([
+        ":tfq_simulate_ops_cuda_py",
+        ":tfq_simulate_ops_cuquantum_py",
+    ]),
 )
 
 cc_binary(
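With the CUDA targets moved behind `if_cuda_is_configured`, a CPU-only build of the package no longer ships these modules. A hedged sketch of how downstream Python might guard the import (module paths are inferred from the Bazel target names and are not confirmed by this diff):

    # Hedged sketch: tolerate CPU-only builds, where the CUDA-gated deps
    # (and thus these modules) were never built.
    try:
        from tensorflow_quantum.core.ops import tfq_simulate_ops_cuda
        from tensorflow_quantum.core.ops import tfq_simulate_ops_cuquantum
        HAS_GPU_OPS = True
    except ImportError:
        HAS_GPU_OPS = False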

tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuda.cu.cc (+3, -78)

@@ -111,13 +111,8 @@ class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
     for (const int num : num_qubits) {
       max_num_qubits = std::max(max_num_qubits, num);
     }
-    if (max_num_qubits >= 26 || programs.size() == 1) {
-      ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
-                   &output_tensor);
-    } else {
-      ComputeSmall(num_qubits, max_num_qubits, fused_circuits, pauli_sums,
-                   context, &output_tensor);
-    }
+    ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
+                 &output_tensor);
   }
 
  private:
@@ -175,76 +170,6 @@ class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
     }
   }
 
-  void ComputeSmall(
-      const std::vector<int>& num_qubits, const int max_num_qubits,
-      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
-      const std::vector<std::vector<PauliSum>>& pauli_sums,
-      tensorflow::OpKernelContext* context,
-      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
-    using Simulator = qsim::SimulatorCUDA<float>;
-    using StateSpace = Simulator::StateSpace;
-
-    StateSpace::Parameter param_default;
-    const int output_dim_op_size = output_tensor->dimension(1);
-
-    Status compute_status = Status();
-    auto c_lock = tensorflow::mutex();
-    auto DoWork = [&](int start, int end) {
-      int old_batch_index = -2;
-      int cur_batch_index = -1;
-      int largest_nq = 1;
-      int cur_op_index;
-
-      // Begin simulation.
-      auto sim = Simulator();
-      auto ss = StateSpace(param_default);
-      auto sv = ss.Create(largest_nq);
-      auto scratch = ss.Create(largest_nq);
-      for (int i = start; i < end; i++) {
-        cur_batch_index = i / output_dim_op_size;
-        cur_op_index = i % output_dim_op_size;
-
-        const int nq = num_qubits[cur_batch_index];
-
-        // (#679) Just ignore empty program
-        if (fused_circuits[cur_batch_index].size() == 0) {
-          (*output_tensor)(cur_batch_index, cur_op_index) = -2.0;
-          continue;
-        }
-
-        if (cur_batch_index != old_batch_index) {
-          // We've run into a new state vector we must compute.
-          // Only compute a new state vector when we have to.
-          if (nq > largest_nq) {
-            largest_nq = nq;
-            sv = ss.Create(largest_nq);
-            scratch = ss.Create(largest_nq);
-          }
-          // no need to update scratch_state since ComputeExpectation
-          // will take care of things for us.
-          ss.SetStateZero(sv);
-          for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
-            qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
-          }
-        }
-
-        float exp_v = 0.0;
-        NESTED_FN_STATUS_SYNC(
-            compute_status,
-            ComputeExpectationQsim(pauli_sums[cur_batch_index][cur_op_index],
-                                   sim, ss, sv, scratch, &exp_v),
-            c_lock);
-        (*output_tensor)(cur_batch_index, cur_op_index) = exp_v;
-        old_batch_index = cur_batch_index;
-      }
-    };
-
-    const int64_t num_cycles =
-        200 * (int64_t(1) << static_cast<int64_t>(max_num_qubits));
-    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
-        fused_circuits.size() * output_dim_op_size, num_cycles, DoWork);
-    OP_REQUIRES_OK(context, compute_status);
-  }
 };
 
 REGISTER_KERNEL_BUILDER(
@@ -279,4 +204,4 @@ REGISTER_OP("TfqSimulateExpectationCuda")
       return ::tensorflow::Status();
     });
 
-}  // namespace tfq
+}  // namespace tfq
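For context on what was deleted: `ComputeSmall` flattened the (circuit, PauliSum) grid into one index range and let TensorFlow's CPU thread pool run `DoWork` over slices of it, re-simulating the state vector only when the batch index changed; the old dispatch sent batches of sub-26-qubit circuits there and everything else to `ComputeLarge`, which after this commit handles all inputs. A minimal Python sketch of that partitioning, with `simulate` and `measure_expectation` as stand-ins for qsim's `ApplyFusedGate` and `ComputeExpectationQsim`:

    def simulate(fused_circuit):
        # Stand-in: would apply each fused gate to a fresh |0...0> state.
        return ("state", id(fused_circuit))

    def measure_expectation(pauli_sum, state):
        # Stand-in: would evaluate <psi|P|psi> using a scratch state.
        return 0.0

    def do_work(start, end, fused_circuits, pauli_sums, out):
        ops_per_circuit = len(pauli_sums[0])  # output_dim_op_size
        old_batch_index, state = -2, None
        for i in range(start, end):
            cur_batch_index = i // ops_per_circuit  # which circuit
            cur_op_index = i % ops_per_circuit      # which PauliSum
            if not fused_circuits[cur_batch_index]:
                # (#679) Just ignore empty programs.
                out[cur_batch_index][cur_op_index] = -2.0
                continue
            if cur_batch_index != old_batch_index:
                # Re-simulate only on a new circuit; consecutive
                # flattened indices reuse the same state vector.
                state = simulate(fused_circuits[cur_batch_index])
            out[cur_batch_index][cur_op_index] = measure_expectation(
                pauli_sums[cur_batch_index][cur_op_index], state)
            old_batch_index = cur_batch_index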

tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc (+5, -77)

@@ -51,6 +51,7 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
       : OpKernel(context) {}
 
   void Compute(tensorflow::OpKernelContext* context) override {
+
     // TODO (mbbrough): add more dimension checks for other inputs here.
     const int num_inputs = context->num_inputs();
     OP_REQUIRES(context, num_inputs == 4,
@@ -116,13 +117,10 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
     // create handles for simulator
     cublasCreate(&cublas_handle_);
     custatevecCreate(&custatevec_handle_);
-    if (max_num_qubits >= 26 || programs.size() == 1) {
-      ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
-                   &output_tensor);  // HOW TO manage extraWorkspace size?
-    } else {
-      ComputeSmall(num_qubits, max_num_qubits, fused_circuits, pauli_sums,
-                   context, &output_tensor);
-    }
+
+    ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
+                 &output_tensor);
+
     // destroy handles in sync with simulator lifetime
     cublasDestroy(cublas_handle_);
     custatevecDestroy(custatevec_handle_);
@@ -186,76 +184,6 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
       }
     }
   }
-
-  void ComputeSmall(
-      const std::vector<int>& num_qubits, const int max_num_qubits,
-      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
-      const std::vector<std::vector<PauliSum>>& pauli_sums,
-      tensorflow::OpKernelContext* context,
-      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
-    using Simulator = qsim::SimulatorCuStateVec<float>;
-    using StateSpace = Simulator::StateSpace;
-
-    const int output_dim_op_size = output_tensor->dimension(1);
-
-    Status compute_status = Status::OK();
-    auto c_lock = tensorflow::mutex();
-    auto DoWork = [&](int start, int end) {
-      int old_batch_index = -2;
-      int cur_batch_index = -1;
-      int largest_nq = 1;
-      int cur_op_index;
-
-      // Launch custatevec, begin simulation.
-      auto sim = Simulator(cublas_handle_, custatevec_handle_);
-      auto ss = StateSpace(cublas_handle_, custatevec_handle_);
-      auto sv = ss.Create(largest_nq);
-      auto scratch = ss.Create(largest_nq);
-      for (int i = start; i < end; i++) {
-        cur_batch_index = i / output_dim_op_size;
-        cur_op_index = i % output_dim_op_size;
-
-        const int nq = num_qubits[cur_batch_index];
-
-        // (#679) Just ignore empty program
-        if (fused_circuits[cur_batch_index].size() == 0) {
-          (*output_tensor)(cur_batch_index, cur_op_index) = -2.0;
-          continue;
-        }
-
-        if (cur_batch_index != old_batch_index) {
-          // We've run into a new state vector we must compute.
-          // Only compute a new state vector when we have to.
-          if (nq > largest_nq) {
-            largest_nq = nq;
-            sv = ss.Create(largest_nq);
-            scratch = ss.Create(largest_nq);
-          }
-          // no need to update scratch_state since ComputeExpectation
-          // will take care of things for us.
-          ss.SetStateZero(sv);
-          for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
-            qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
-          }
-        }
-
-        float exp_v = 0.0;
-        NESTED_FN_STATUS_SYNC(
-            compute_status,
-            ComputeExpectationQsim(pauli_sums[cur_batch_index][cur_op_index],
-                                   sim, ss, sv, scratch, &exp_v),
-            c_lock);
-        (*output_tensor)(cur_batch_index, cur_op_index) = exp_v;
-        old_batch_index = cur_batch_index;
-      }
-    };
-
-    const int64_t num_cycles =
-        200 * (int64_t(1) << static_cast<int64_t>(max_num_qubits));
-    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
-        fused_circuits.size() * output_dim_op_size, num_cycles, DoWork);
-    OP_REQUIRES_OK(context, compute_status);
-  }
 };
 
 REGISTER_KERNEL_BUILDER(

tensorflow_quantum/core/ops/tfq_simulate_ops_gpu_test.py (+12, -7)

@@ -65,19 +65,21 @@ def test_simulate_expectation_cpu_vs_cuda(self):
                 circuit_batch_tensor,
                 symbol_names, symbol_values_array.astype(np.float64),
                 pauli_sums_tensor),
-            "CPU"
+            "CPU",
+            num_samples=100,
         )
 
         cuda_avg_time, res_cuda = measure_average_runtime(
             lambda: tfq_simulate_ops_cuda.tfq_simulate_expectation(
                 circuit_batch_tensor,
                 symbol_names, symbol_values_array.astype(np.float64),
                 pauli_sums_tensor),
-            "CUDA"
+            "CUDA",
+            num_samples=100,
        )
 
         # The result should be the similar within a tolerance.
-        np.testing.assert_allclose(res_cpu, res_cuda, atol=1e-5)
+        np.testing.assert_allclose(res_cpu, res_cuda, atol=1e-4)
 
         # CUDA op should be faster than CPU op.
         self.assertGreater(cpu_avg_time, cuda_avg_time)
@@ -107,19 +109,22 @@ def test_simulate_expectation_cpu_vs_cuquantum(self):
                 circuit_batch_tensor,
                 symbol_names, symbol_values_array.astype(np.float64),
                 pauli_sums_tensor),
-            "CPU"
-        )
+            "CPU",
+            num_samples=100,
 
+        )
+
         cuda_avg_time, res_cuda = measure_average_runtime(
             lambda: tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
                 circuit_batch_tensor,
                 symbol_names, symbol_values_array.astype(np.float64),
                 pauli_sums_tensor),
-            "cuQuantum"
+            "cuQuantum",
+            num_samples=100,
         )
 
         # The result should be the similar within a tolerance.
-        np.testing.assert_allclose(res_cpu, res_cuda, atol=1e-5)
+        np.testing.assert_allclose(res_cpu, res_cuda, atol=1e-4)
 
         # cuQuantum op should be faster than CPU op.
         self.assertGreater(cpu_avg_time, cuda_avg_time)
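These test hunks pass `num_samples=100` explicitly and loosen the comparison tolerance from 1e-5 to 1e-4. The `measure_average_runtime` helper itself is outside this diff; a hedged sketch of the signature the call sites imply (the real helper in the test module may differ):

    import time

    # Hedged sketch: run fn() num_samples times and return the average
    # wall-clock time together with the last result, matching the
    # (avg_time, result) unpacking at the call sites above.
    def measure_average_runtime(fn, label, num_samples=100):
        total, result = 0.0, None
        for _ in range(num_samples):
            start = time.time()
            result = fn()
            total += time.time() - start
        avg = total / num_samples
        print(f"{label} average runtime: {avg:.6f}s")
        return avg, result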
