from __future__ import annotations

import math
import random
from argparse import Namespace
from typing import Optional, Sequence, Union

import torch
import torch.nn as nn
from torch.nn.parallel.scatter_gather import Gather

from src.aq import QuantizedWeight
from src.utils import ellipsis


class AQEngine(nn.Module):
    """A wrapper class that runs AQ training for a single linear layer. All the important math is in aq.py"""

    def __init__(self, layer: nn.Linear, accumulator_dtype: torch.dtype = torch.float64):
        super().__init__()
        self.layer = layer
        self.device = layer.weight.device
        self.columns = self.layer.weight.data.shape[1]
        self.register_buffer(
            "XTX", torch.zeros((self.columns, self.columns), dtype=accumulator_dtype, device=self.device)
        )
        self.quantized_weight: Optional[QuantizedWeight] = None
        self.nsamples = 0

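    # XTX is kept in float64 by default (accumulator_dtype), presumably so that accumulating
    # many small calibration batches does not lose precision before quantization starts.
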
    @torch.no_grad()
    def add_batch(self, inp: torch.Tensor):
        """Accumulate a minibatch of layer inputs and update the running average of X^T @ X (a.k.a. the half-Hessian)"""
        assert self.XTX is not None, "Already ran quantization; cannot add more data batches"
        if len(inp.shape) == 3:
            inp = inp.reshape((-1, inp.shape[-1]))
        tmp = inp.shape[0]
        inp = inp.t()

        self.XTX *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        inp = math.sqrt(1 / self.nsamples) * inp.to(self.XTX.dtype)
        self.XTX += inp.matmul(inp.t())

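    # Note on the accumulator update above: XTX stores the running average (1 / nsamples) * sum_i x_i x_i^T.
    # Rescaling the old accumulator by nsamples_old / (nsamples_old + batch_size) and adding the new batch
    # with the 1 / sqrt(nsamples_new) factor folded into `inp` preserves this invariant without ever
    # storing the unnormalized sum.
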
    @torch.enable_grad()
    def quantize(self, *, args: Namespace, verbose: bool = True) -> QuantizedWeight:
        """Create a QuantizedWeight based on the collected Hessian (XTX) data and train it with the specified args"""
        assert isinstance(args.devices, (list, tuple)) and len(args.devices) >= 1, f"Found devices = {args.devices}"
        assert args.devices[0] == self.device, (args.devices[0], self.XTX.device)
        self.quantized_weight = QuantizedWeight(
            XTX=self.XTX.to(device=self.device, dtype=torch.float32),
            reference_weight=self.layer.weight.detach().to(device=self.device, dtype=torch.float32),
            out_group_size=args.out_group_size,
            in_group_size=args.in_group_size,
            num_codebooks=args.num_codebooks,
            nbits_per_codebook=args.nbits_per_codebook,
            codebook_value_nbits=args.codebook_value_nbits,
            codebook_value_num_groups=args.codebook_value_num_groups,
            scale_nbits=args.scale_nbits,
            rrr_rank=args.rrr_rank,
            max_iter=args.init_max_iter,
            max_points_per_centroid=args.max_points_per_centroid,
            devices=args.devices,
            verbose=True,
        )

        differentiable_parameters = nn.ParameterDict(
            {name: param for name, param in self.quantized_weight.named_parameters() if param.requires_grad}
        )
        opt = torch.optim.Adam(differentiable_parameters.values(), lr=args.lr, betas=(0.0, 0.95), amsgrad=True)

        replicas = None
        if len(args.devices) > 1:
            replicas = torch.nn.parallel.replicate(self, args.devices)
            replicas[0] = self  # the first replica is self, so it keeps sharing parameters with this engine

        previous_best_loss = float("inf")  # for early stopping
        for epoch in range(args.max_epochs):
            # train codebooks and scales
            for step in range(args.steps_per_epoch):
                if len(args.devices) == 1:
                    loss = self._compute_mse()
                else:
                    loss = self._compute_mse_parallel(args.devices, replicas, differentiable_parameters)

                if not torch.isfinite(loss).item():
                    raise ValueError(f"Quantization loss is {loss}")
                if step == 0 and args.relative_mse_tolerance is not None:
                    if loss.item() / previous_best_loss > (1.0 - args.relative_mse_tolerance):
                        return self.quantized_weight  # early stopping; no updates after the last epoch's beam search
                    previous_best_loss = min(previous_best_loss, loss.item())

                opt.zero_grad()
                loss.backward()
                opt.step()
                if verbose and (epoch * args.steps_per_epoch + step) % args.print_frequency == 0:
                    print(f"epoch={epoch}\tstep={step}\tloss={loss.item():.10f}\t")

            # search for better codes (cluster indices)
            seed = random.getrandbits(256)
            self.beam_search_update_codes_(
                args.devices,
                replicas,
                differentiable_parameters,
                seed=seed,
                beam_size=args.beam_size,
                sparsity_regularizer=args.sparsity_regularizer,
                verbose=True,
            )
        return self.quantized_weight

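    # The loop in quantize() alternates two kinds of updates: Adam steps on the continuous
    # parameters (codebooks and scales) within each epoch, followed by one discrete beam-search
    # pass over the codes at the end of the epoch. Early stopping fires when the loss measured
    # at the start of an epoch improves on the previous best by less than args.relative_mse_tolerance.
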
    def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
        """
        Compute the activation MSE error = ||X @ quantized_weight.T - X @ reference_weight.T||^2
        Use the square-of-difference formula to avoid materializing per-batch predictions
        :param selection: By default, compute MSE normally. If selection is specified, this method will instead
        compute MSE over a portion of output channels that align with the selected out_groups (for parallelism)
        The indices / slices must correspond to output channels (if out_group_size==1) or groups (if > 1).
        Formally, the indices must be in range [ 0 , self.out_features // self.out_group_size )
        """
        assert self.quantized_weight is not None, "must be called inside / after AQEngine.quantize"
        quantized_weight = self.quantized_weight(selection)

        if isinstance(selection, ellipsis):
            reference_weight = self.layer.weight.detach().to(quantized_weight.dtype)
        else:
            assert isinstance(selection, slice)
            out_channel_selection = slice(
                selection.start * self.quantized_weight.out_group_size,
                selection.stop * self.quantized_weight.out_group_size,
            )

            reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype)
        delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype)
        return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features

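    # Why the square-of-difference form in _compute_mse works: writing
    # delta_W = quantized_weight - reference_weight, one has
    #     ||X @ delta_W.T||_F^2 = trace(delta_W @ (X.T @ X) @ delta_W.T),
    # so the calibration activations are only needed through the accumulated XTX matrix, and
    # (delta_W @ XTX).flatten() @ delta_W.flatten() evaluates exactly that trace.
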
    def _substitute_and_compute_mse(self, overrides: nn.ParameterDict, selection: slice) -> torch.Tensor:
        """Utility for parallelism: replace the specified parameters of self.quantized_weight, then compute MSE"""
        for param_name, param_value in overrides.items():
            replace_parameter_(self.quantized_weight, param_name, param_value)
        return self._compute_mse(selection)

    def _compute_mse_parallel(
        self, devices: Sequence[torch.device], replicas: Sequence[AQEngine], parameters_to_replicate: nn.ParameterDict
    ) -> torch.Tensor:
        """Compute MSE in parallel over output channels"""
        replicated_parameters = torch.nn.parallel.replicate(parameters_to_replicate, devices, detach=False)
        num_output_groups = self.quantized_weight.out_features // self.quantized_weight.out_group_size
        shard_size = (num_output_groups - 1) // len(devices) + 1
        active_slices_by_replica = [
            slice(i * shard_size, min((i + 1) * shard_size, num_output_groups)) for i in range(len(devices))
        ]
        funcs_by_replica = [replica._substitute_and_compute_mse for replica in replicas]
        inputs_by_replica = [(dict(), active_slices_by_replica[0])]  # no overrides needed for 0-th replica
        for i in range(1, len(devices)):
            inputs_by_replica.append((replicated_parameters[i], active_slices_by_replica[i]))
        mse_components = torch.nn.parallel.parallel_apply(funcs_by_replica, inputs_by_replica, devices=devices)
        return Gather.apply(devices[0], 0, *(mse.view(1) for mse in mse_components)).sum()

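    # Sharding note: each replica handles a contiguous slice of output groups, with
    # shard_size = ceil(num_output_groups / len(devices)). Since the MSE decomposes as a sum
    # over output channels, the per-replica partial losses are gathered onto devices[0] and
    # added; Gather is an autograd Function, so the summed loss stays differentiable across devices.
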
    def _substitute_and_beam_search(self, overrides: nn.ParameterDict, selection: slice, **kwargs) -> torch.Tensor:
        """Utility for parallelism: replace the specified parameters of self.quantized_weight, then run beam search"""
        dtype = self.quantized_weight.codebooks.dtype
        for param_name, param_value in overrides.items():
            replace_parameter_(self.quantized_weight, param_name, param_value)
        out_channel_selection = slice(
            selection.start * self.quantized_weight.out_group_size,
            selection.stop * self.quantized_weight.out_group_size,
        )
        reference_weight = self.layer.weight.detach()[out_channel_selection].to(dtype)
        return self.quantized_weight.beam_search_update_codes_(
            self.XTX.to(dtype), reference_weight, selection=selection, **kwargs
        ).clone()

    @torch.no_grad()
    def beam_search_update_codes_(
        self,
        devices: Sequence[torch.device],
        replicas: Sequence[AQEngine],
        parameters_to_replicate: nn.ParameterDict,
        seed: Optional[int] = None,
        **kwargs,
    ):
        """Update self.quantized_weight.codes in-place via beam search"""
        if len(devices) == 1:  # single device
            assert replicas is None
            dtype = self.quantized_weight.codebooks.dtype
            self.quantized_weight.beam_search_update_codes_(
                self.XTX.to(dtype), self.layer.weight.detach().to(dtype), dim_rng=random.Random(seed), **kwargs
            )
        else:
            assert replicas[0] is self
            replicated_parameters = torch.nn.parallel.replicate(parameters_to_replicate, devices)
            num_output_groups = self.quantized_weight.out_features // self.quantized_weight.out_group_size
            shard_size = (num_output_groups - 1) // len(devices) + 1
            active_slices_by_replica = [
                slice(i * shard_size, min((i + 1) * shard_size, num_output_groups)) for i in range(len(devices))
            ]

            funcs_by_replica = [replica._substitute_and_beam_search for replica in replicas]
            inputs_by_replica = [(dict(), active_slices_by_replica[0])]
            for i in range(1, len(devices)):
                inputs_by_replica.append((replicated_parameters[i], active_slices_by_replica[i]))
            # every replica gets the same seed so that seed-dependent choices inside beam search stay in sync
            kwargs_by_replica = [dict(kwargs, dim_rng=random.Random(seed)) for _ in range(len(devices))]
            new_code_parts_by_replica = torch.nn.parallel.parallel_apply(
                funcs_by_replica, inputs_by_replica, kwargs_by_replica, devices=devices
            )
            # gather all code parts and assign them to each replica
            for device, replica in zip(devices, replicas):
                replica.quantized_weight.codes[...] = Gather.apply(device, 0, *new_code_parts_by_replica)


def replace_parameter_(module: nn.Module, name: str, new_value: torch.Tensor):
    """A hacky way to substitute an already registered parameter with a non-parameter tensor. Breaks future use."""
    if name in module._parameters:
        module._parameters[name] = new_value
    else:
        setattr(module, name, new_value)
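
# Usage sketch (illustrative; the model, calibration loader, and `args` namespace named below are
# hypothetical placeholders): a typical calibration loop registers a forward pre-hook on the target
# nn.Linear so that AQEngine.add_batch sees the same inputs the layer receives, then calls quantize().
#
#     layer = model.model.layers[0].mlp.down_proj       # hypothetical target layer
#     engine = AQEngine(layer)
#     hook = layer.register_forward_pre_hook(lambda module, inputs: engine.add_batch(inputs[0]))
#     with torch.no_grad():
#         for batch in calibration_loader:              # hypothetical calibration data
#             model(batch)
#     hook.remove()
#     quantized_weight = engine.quantize(args=args, verbose=True)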