
Commit 5e62d4c

Update on "Add runtime assert"
This PR introduces a new operator, `aten._assert_async.msg`, which takes a tensor value and an assertion message as inputs. As part of TorchDynamo, we're replacing uses of `torch._assert` with this new operator so that `make_fx` also knows how to handle assertions. Originally, we planned to create a dependency chain to introduce a fake control dependency, but this new implementation appears to work with AOTAutograd and friends, as will be demonstrated in the next pull request. In addition, input constraints and intermediate constraints are now turned into runtime assertions using `aten._assert_async.msg`.

Future work:
1. Assess whether we still need to introduce a fake control dependency.
2. Explore adding a non-async version of assert.

cc soumith voznesenskym penguinwu anijain2305 EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx desertfire

[ghstack-poisoned]
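As a quick illustration of the operator described above, here is a minimal sketch of a possible Python call site; treat the exact invocation (`torch.ops.aten._assert_async.msg`) as an assumption based on the description, not a documented API:

```python
import torch

def safe_log(x: torch.Tensor) -> torch.Tensor:
    # The condition is materialized as a boolean tensor; per the description,
    # the op takes the tensor plus a message and raises if the value is falsy.
    cond = (x > 0).all()
    # Hypothetical call site for the overload introduced by this PR:
    torch.ops.aten._assert_async.msg(cond, "safe_log: inputs must be positive")
    return torch.log(x)

print(safe_log(torch.tensor([1.0, 2.0, 3.0])))
```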
2 parents 255ad79 + 1e65b25 commit 5e62d4c

149 files changed: 4,283 additions, 1,556 deletions


.ci/pytorch/common-build.sh
Lines changed: 5 additions & 4 deletions

@@ -31,19 +31,20 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
   # as though sccache still gets used even when the sscache server isn't started
   # explicitly
   echo "Skipping sccache server initialization, setting environment variables"
-  export SCCACHE_IDLE_TIMEOUT=1200
+  export SCCACHE_IDLE_TIMEOUT=0
   export SCCACHE_ERROR_LOG=~/sccache_error.log
   export RUST_LOG=sccache::server=error
 elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
   SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
 else
   # increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:
   # https://github.com/pytorch/pytorch/pull/16645
-  SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=1200 RUST_LOG=sccache::server=error sccache --start-server
+  SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 RUST_LOG=sccache::server=error sccache --start-server
 fi
 
-  # Report sccache stats for easier debugging
-  sccache --zero-stats
+  # Report sccache stats for easier debugging. It's ok if this command
+  # times out and fails on MacOS
+  sccache --zero-stats || true
 fi
 
 if which ccache > /dev/null; then

.ci/pytorch/test.sh
Lines changed: 23 additions & 16 deletions

@@ -278,6 +278,10 @@ else
   DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
 
+if [[ "${TEST_CONFIG}" == *max_autotune* ]]; then
+  export TORCHINDUCTOR_MAX_AUTOTUNE=1
+fi
+
 test_perf_for_dashboard() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
@@ -292,30 +296,33 @@ test_perf_for_dashboard() {
     # Run accuracy test for inductor with different configs
    # --disable-cudagraphs is the default inductor behavior
    # TODO: update here once cudagraphs is turned on as default
-    python "benchmarks/dynamo/$suite.py" \
-      --accuracy --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
-      --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
+    if [[ "${TEST_CONFIG}" != *max_autotune* ]]; then
+      python "benchmarks/dynamo/$suite.py" \
+        --accuracy --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
+        --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
+      python "benchmarks/dynamo/$suite.py" \
+        --accuracy --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes --dynamic-batch-only --disable-cudagraphs "$@" \
+        --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
+    fi
+    # Only test this one config for max-autotune
     python "benchmarks/dynamo/$suite.py" \
       --accuracy --"$mode" --"$dtype" --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
-    python "benchmarks/dynamo/$suite.py" \
-      --accuracy --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes --dynamic-batch-only --disable-cudagraphs "$@" \
-      --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
 
     # Run performance test
-    # Skip dynamo-eager and aot-eager for performance test
-    # Run performance test for inductor with different configs
-    # TODO: add more configs here, e.g. max-autotune, etc.
-    python "benchmarks/dynamo/$suite.py" \
-      --performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
-      --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_performance.csv"
+    if [[ "${TEST_CONFIG}" != *max_autotune* ]]; then
+      python "benchmarks/dynamo/$suite.py" \
+        --performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
+        --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_performance.csv"
+      python "benchmarks/dynamo/$suite.py" \
+        --performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
+        --dynamic-batch-only --disable-cudagraphs "$@" \
+        --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_performance.csv"
+    fi
+    # Only test this one config for max-autotune
     python "benchmarks/dynamo/$suite.py" \
       --performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_performance.csv"
-    python "benchmarks/dynamo/$suite.py" \
-      --performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
-      --dynamic-batch-only --disable-cudagraphs "$@" \
-      --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_performance.csv"
   done
 }
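Note: `TORCHINDUCTOR_MAX_AUTOTUNE=1`, exported above, is the environment-variable form of TorchInductor's `max_autotune` config. A minimal sketch of flipping the same switch from Python (the toy model is illustrative):

```python
import torch
import torch._inductor.config as inductor_config

# Same effect as `export TORCHINDUCTOR_MAX_AUTOTUNE=1` in the CI script:
# Inductor benchmarks multiple kernel choices (e.g. for matmuls) and picks
# the fastest, trading longer compile time for better runtime.
inductor_config.max_autotune = True

def f(x):
    return torch.relu(x @ x)

compiled = torch.compile(f)
print(compiled(torch.randn(64, 64)).shape)
```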

.github/ci_commit_pins/torchbench.txt
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-159e58f0b36ee22e2b89d74bd7dc8a79376de01d
+a0848e19bad26ed92810b56616e93dbec0eeaa24
(new GitHub Actions workflow file; path not shown in this view)
Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+name: inductor-A100-max-autotune-weekly
+
+on:
+  schedule:
+    - cron: 0 0 * * 0
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-build:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_max_autotune", shard: 1, num_shards: 3, runner: "linux.gcp.a100.large" },
+          { config: "inductor_huggingface_perf_max_autotune", shard: 2, num_shards: 3, runner: "linux.gcp.a100.large" },
+          { config: "inductor_huggingface_perf_max_autotune", shard: 3, num_shards: 3, runner: "linux.gcp.a100.large" },
+          { config: "inductor_timm_perf_max_autotune", shard: 1, num_shards: 6, runner: "linux.gcp.a100.large" },
+          { config: "inductor_timm_perf_max_autotune", shard: 2, num_shards: 6, runner: "linux.gcp.a100.large" },
+          { config: "inductor_timm_perf_max_autotune", shard: 3, num_shards: 6, runner: "linux.gcp.a100.large" },
+          { config: "inductor_timm_perf_max_autotune", shard: 4, num_shards: 6, runner: "linux.gcp.a100.large" },
+          { config: "inductor_timm_perf_max_autotune", shard: 5, num_shards: 6, runner: "linux.gcp.a100.large" },
+          { config: "inductor_timm_perf_max_autotune", shard: 6, num_shards: 6, runner: "linux.gcp.a100.large" },
+          { config: "inductor_torchbench_perf_max_autotune", shard: 1, num_shards: 3, runner: "linux.gcp.a100.large" },
+          { config: "inductor_torchbench_perf_max_autotune", shard: 2, num_shards: 3, runner: "linux.gcp.a100.large" },
+          { config: "inductor_torchbench_perf_max_autotune", shard: 3, num_shards: 3, runner: "linux.gcp.a100.large" },
+        ]}
+
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-test:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
+      timeout-minutes: 720

.github/workflows/inductor-perf-test-nightly.yml
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ name: inductor-A100-perf-nightly
 
 on:
   schedule:
-    - cron: 45 1,13 * * *
+    - cron: 45 1,13 * * 1-6
   workflow_dispatch:
 
 concurrency:
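For reference, the new schedule `45 1,13 * * 1-6` fires at 01:45 and 13:45 UTC, Monday through Saturday; dropping Sunday from the nightly run frees the A100 runners for the weekly max-autotune workflow added above, which runs at `0 0 * * 0` (Sunday midnight).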

.github/workflows/periodic.yml
Lines changed: 12 additions & 0 deletions

@@ -138,6 +138,18 @@ jobs:
      cuda-version: "11.8"
      test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }}
 
+  ios-12-5-1-x86-64:
+    name: ios-12-5-1-x86-64
+    uses: ./.github/workflows/_ios-build-test.yml
+    with:
+      build-environment: ios-12-5-1-x86-64
+      ios-platform: SIMULATOR
+      ios-arch: x86_64
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1, runner: "macos-12" },
+        ]}
+
   ios-12-5-1-x86-64-coreml:
     name: ios-12-5-1-x86-64-coreml
     uses: ./.github/workflows/_ios-build-test.yml

.github/workflows/trunk.yml
Lines changed: 0 additions & 12 deletions

@@ -87,18 +87,6 @@ jobs:
          { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
        ]}
 
-  ios-12-5-1-x86-64:
-    name: ios-12-5-1-x86-64
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-x86-64
-      ios-platform: SIMULATOR
-      ios-arch: x86_64
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "macos-12" },
-        ]}
-
   macos-12-py3-arm64-build:
     name: macos-12-py3-arm64
     uses: ./.github/workflows/_mac-build.yml

.lintrunner.toml
Lines changed: 3 additions & 2 deletions

@@ -623,6 +623,7 @@ include_patterns = [
 exclude_patterns = [
     'aten/src/ATen/test/**',
     'c10/cuda/CUDAFunctions.h',
+    'c10/cuda/CUDACachingAllocator.cpp',
 ]
 command = [
     'python3',
@@ -657,8 +658,8 @@ exclude_patterns = [
 command = [
     'python3',
     'tools/linter/adapters/grep_linter.py',
-    '--pattern=cudaSetDevice',
-    '--pattern=cudaGetDevice',
+    '--pattern=cudaSetDevice(',
+    '--pattern=cudaGetDevice(',
     '--linter-name=RAWCUDADEVICE',
     '--error-name=raw CUDA API usage',
     """--error-description=\

aten/src/ATen/CPUGeneratorImpl.cpp
Lines changed: 15 additions & 0 deletions

@@ -94,6 +94,21 @@ void CPUGeneratorImpl::set_current_seed(uint64_t seed) {
   engine_ = mt19937(seed);
 }
 
+/**
+ * Sets the offset of RNG state.
+ * See Note [Acquire lock when using random generators]
+ */
+void CPUGeneratorImpl::set_offset(uint64_t offset) {
+  TORCH_CHECK(false, "CPU Generator does not use offset");
+}
+
+/**
+ * Gets the current offset of CPUGeneratorImpl.
+ */
+uint64_t CPUGeneratorImpl::get_offset() const {
+  TORCH_CHECK(false, "CPU Generator does not use offset");
+}
+
 /**
  * Gets the current seed of CPUGeneratorImpl.
  */

aten/src/ATen/CPUGeneratorImpl.h
Lines changed: 2 additions & 0 deletions

@@ -15,6 +15,8 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
   // CPUGeneratorImpl methods
   std::shared_ptr<CPUGeneratorImpl> clone() const;
   void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
   void set_state(const c10::TensorImpl& new_state) override;

aten/src/ATen/TensorIterator.h
Lines changed: 17 additions & 0 deletions

@@ -372,6 +372,23 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
     return c10::fetch_and_cast<T>(op.tensor_base().scalar_type(), op.data);
   }
 
+  /// Return scalar value from original_tensor_base if it is defined. When
+  /// common_dtype is Half, casting scalar input to common_dtype might overflow.
+  /// If the scalar is already given in the type of Half, then return scalar
+  /// value from tensor_base.
+  template <typename T>
+  T original_scalar_value(int arg) {
+    auto& original_tensor_base = operands_[arg].original_tensor_base();
+    if (original_tensor_base.defined()) {
+      TORCH_INTERNAL_ASSERT(
+          original_tensor_base.scalar_type() != common_dtype());
+      return c10::fetch_and_cast<T>(
+          original_tensor_base.scalar_type(), original_tensor_base.data_ptr());
+    } else {
+      return scalar_value<T>(arg);
+    }
+  }
+
 private:
   template <typename loop1d_t>
   auto loop_2d_from_1d(const loop1d_t& loop) {
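The overflow the new doc comment warns about is easy to reproduce: Half (float16) tops out at 65504, so pre-casting a larger scalar to the common dtype destroys its value. A short Python illustration:

```python
import torch

# float16's largest finite value is 65504; anything bigger saturates to inf.
scalar = 70000.0
print(torch.tensor(scalar, dtype=torch.float16))  # tensor(inf, dtype=torch.float16)

# Keeping the scalar in its original dtype (as original_scalar_value does on
# the C++ side) preserves the exact value for kernels that compare in float32.
print(torch.tensor(scalar, dtype=torch.float32))  # tensor(70000.)
```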

aten/src/ATen/core/Generator.h
Lines changed: 7 additions & 0 deletions

@@ -93,6 +93,13 @@ struct TORCH_API Generator {
   }
 
   void set_current_seed(uint64_t seed) { impl_->set_current_seed(seed); }
+
+  // Sets the offset of Generator state to the desired offset. This is currently
+  // supported for only Philox based Generators, i.e., CUDA and MPS.
+  void set_offset(uint64_t offset) { impl_->set_offset(offset); }
+
+  // Returns the offset of Generator state. This is currently supported for only
+  // Philox based Generators, i.e., CUDA and MPS.
+  uint64_t get_offset() const { return impl_->get_offset(); }
 
   uint64_t current_seed() const { return impl_->current_seed(); }
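A hypothetical usage sketch, assuming the new accessors are surfaced on `torch.Generator` for Philox-based devices (the Python binding itself is not part of this diff):

```python
import torch

if torch.cuda.is_available():
    gen = torch.Generator(device="cuda")
    gen.manual_seed(1234)
    # Fast-forward the Philox counter, e.g. to restore a checkpointed RNG
    # stream without replaying draws. Assumed binding of the C++ set_offset.
    gen.set_offset(128)
    assert gen.get_offset() == 128
    torch.randn(4, device="cuda", generator=gen)
```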

aten/src/ATen/core/PhiloxRNGEngine.h
Lines changed: 17 additions & 0 deletions

@@ -86,6 +86,23 @@ class philox_engine {
     STATE = 0;
   }
 
+  /**
+   * Set the offset field of Philox Generator to the desired offset.
+   */
+  C10_HOST_DEVICE inline void set_offset(uint64_t offset) {
+    counter_[0] = static_cast<uint32_t>(offset);
+    counter_[1] = static_cast<uint32_t>(offset >> 32);
+  }
+
+  /**
+   * Gets the current offset of the Philox Generator.
+   */
+  C10_HOST_DEVICE uint64_t get_offset() const {
+    uint64_t lo = static_cast<uint64_t>(counter_[0]);
+    uint64_t hi = static_cast<uint64_t>(counter_[1]) << 32;
+    return lo | hi;
+  }
+
   /**
    * Produces a unique 32-bit pseudo random number on every invocation. Bookkeeps state to avoid waste.
    */
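These two methods simply split the 64-bit offset across the two low 32-bit counter words and reassemble it. A quick round-trip check in Python:

```python
# Round trip of the offset packing in set_offset/get_offset above.
offset = 0x1234_5678_9ABC_DEF0

lo = offset & 0xFFFF_FFFF            # counter_[0]: low 32 bits
hi = (offset >> 32) & 0xFFFF_FFFF    # counter_[1]: high 32 bits

assert (hi << 32) | lo == offset     # get_offset(): lo | (hi << 32)
print(hex(lo), hex(hi))              # 0x9abcdef0 0x12345678
```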

aten/src/ATen/cuda/CUDABlas.cpp
Lines changed: 32 additions & 0 deletions

@@ -104,6 +104,17 @@ static void _cublasAdjustLdLevel3(
     *ldb = std::max<int64_t>(k, 1);
   }
 }
+
+uint32_t _getAlignment(uintptr_t address) {
+  // alignment is in bytes
+  uint32_t alignment = 256;
+  for (; ; alignment /= 2) {
+    if (!(address % alignment)) {
+      return alignment;
+    }
+  }
+}
+
 } // anonymous namespace
 
 namespace at {
@@ -703,6 +714,27 @@ void gemm_and_bias(
       &workspaceSize,
       sizeof(workspaceSize)));
 
+  uint32_t a_alignment = _getAlignment(reinterpret_cast<uintptr_t>(mat1_ptr));
+  uint32_t b_alignment = _getAlignment(reinterpret_cast<uintptr_t>(mat2_ptr));
+  uint32_t c_alignment = _getAlignment(reinterpret_cast<uintptr_t>(result_ptr));
+  uint32_t d_alignment = _getAlignment(reinterpret_cast<uintptr_t>(bias));
+  TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
+      preference.descriptor(),
+      CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES,
+      &a_alignment, sizeof(a_alignment)));
+  TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
+      preference.descriptor(),
+      CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES,
+      &b_alignment, sizeof(b_alignment)));
+  TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
+      preference.descriptor(),
+      CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES,
+      &c_alignment, sizeof(c_alignment)));
+  TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
+      preference.descriptor(),
+      CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES,
+      &d_alignment, sizeof(d_alignment)));
+
   auto workspace = at::empty(
       {static_cast<int64_t>(workspaceSize)},
       at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));
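`_getAlignment` finds the largest power of two (capped at 256 bytes) that divides a pointer's address; the results are handed to cuBLASLt so it can pick kernels matching the operands' actual alignment. The same computation in Python:

```python
def get_alignment(address: int) -> int:
    """Largest power-of-two divisor of address, capped at 256 bytes
    (mirrors the _getAlignment helper above; address 0 returns 256)."""
    alignment = 256
    while address % alignment:
        alignment //= 2
    return alignment

assert get_alignment(0) == 256    # 0 is divisible by anything
assert get_alignment(256) == 256
assert get_alignment(136) == 8    # 136 = 8 * 17
assert get_alignment(7) == 1      # odd addresses fall through to 1
```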

aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
Lines changed: 21 additions & 0 deletions

@@ -117,6 +117,27 @@ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
   no_reset_rnn_state_.clear();
 }
 
+/**
+ * Sets the offset to be used by curandStatePhilox4_32_10
+ *
+ * See Note [Acquire lock when using random generators]
+ */
+void CUDAGeneratorImpl::set_offset(uint64_t offset) {
+  at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_offset");
+  philox_offset_per_thread_ = offset;
+  no_reset_rnn_state_.clear();
+}
+
+/**
+ * Gets the current offset of CUDAGeneratorImpl.
+ */
+uint64_t CUDAGeneratorImpl::get_offset() const {
+  // Debatable if get_offset() should be allowed in captured regions.
+  // Conservatively disallow it for now.
+  at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::get_offset");
+  return philox_offset_per_thread_;
+}
+
 #define CAPTURE_DEFAULT_GENS_MSG \
   "In regions captured by CUDA graphs, you may only use the default CUDA RNG " \
   "generator on the device that's current when capture begins. " \

aten/src/ATen/cuda/CUDAGeneratorImpl.h
Lines changed: 2 additions & 0 deletions

@@ -95,6 +95,8 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   // CUDAGeneratorImpl methods
   std::shared_ptr<CUDAGeneratorImpl> clone() const;
   void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
   void set_state(const c10::TensorImpl& new_state) override;

aten/src/ATen/mps/MPSGeneratorImpl.h
Lines changed: 2 additions & 0 deletions

@@ -31,6 +31,8 @@ struct TORCH_API MPSGeneratorImpl : public c10::GeneratorImpl {
   // MPSGeneratorImpl methods
   std::shared_ptr<MPSGeneratorImpl> clone() const;
   void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
   void set_state(const c10::TensorImpl& new_state) override;
