-
-
Notifications
You must be signed in to change notification settings - Fork 7.8k
Add GPTQ support #916
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Add GPTQ support #916
Changes from all commits
Commits
Show all changes
27 commits
Select commit
Hold shift + click to select a range
82e6b2e
Add gptq implementation compatible with awq interface
chu-tianxiang 612d7b1
Add more models
chu-tianxiang 049a37c
fix bug in model loading
chu-tianxiang 5563578
Add fallback kernel for desc act models
chu-tianxiang 0470121
Fix engine args and opt model
chu-tianxiang 92c7f8d
Merge main branch
chu-tianxiang f9d0ccc
Add mistral model
chu-tianxiang cbf9433
Fix bug in gpt layer
chu-tianxiang a7b391d
Fix conflict
chu-tianxiang b51ebb7
Merge main branch
chu-tianxiang 9a99461
Fix squeezellm
chu-tianxiang 2593dfe
Use exllama v2 kernels for better performance
chu-tianxiang 97072a7
Add Yi and ChatGLM GPTQ support
chu-tianxiang 2d8dc1d
Fix chatglm
chu-tianxiang 22ea9ce
merge main
chu-tianxiang 17b6f2b
Fix phi model
chu-tianxiang 62bd8ce
move post init to first forward pass to make code cleaner
chu-tianxiang e1c4c25
merge main
chu-tianxiang b6b8c63
Update GPTQ kernel and fix minor problems
chu-tianxiang 1bcb832
Merge main
chu-tianxiang d1954ab
Fix typo
chu-tianxiang 514021c
Merge branch 'main' into gptq_hf
WoosukKwon 62d6760
Minor fix
WoosukKwon 5156579
Minor
WoosukKwon 1f3f6ee
Support Mixtral
WoosukKwon 99cc231
Ignore warning
WoosukKwon 17fcdd2
Fix squeezellm
WoosukKwon File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
/* | ||
Copied from https://github.com/turboderp/exllamav2
*/ | ||
|
||
#ifndef _compat_cuh | ||
#define _compat_cuh | ||
|
||
namespace vllm { | ||
namespace gptq { | ||
// atomicAdd for half types, to support CC < 7.x | ||
|
||
// Software atomicAdd for a single half, used where the hardware instruction
// is unavailable (CC < 7.0, or ROCm builds — see the guards below).
//
// Operates on the aligned 32-bit word that contains the target half: byte
// offset ((size_t)address & 2) tells us whether the half occupies the low or
// high 16 bits of that word. A classic compare-and-swap loop retries until
// the update is applied without interference from a concurrent writer.
__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    // Round the address down to the enclosing 4-byte word boundary.
    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do
    {
        assumed = old;
        __half_raw hsum;
        // Extract the 16 bits holding the target half from the observed word.
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        half tmpres = __hadd(hsum, val);
        hsum = __half_raw(tmpres);
        // Splice the updated 16 bits back into the word, preserving the
        // neighboring half in the other 16 bits.
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        // atomicCAS returns the value it actually observed; if another thread
        // changed the word since our read, retry with that fresh value.
        old = atomicCAS(address_as_ui, assumed, old);
    }
    while (assumed != old);
}
|
||
// atomicAdd for half2 types | ||
|
||
// Software atomicAdd for a half2 pair, used where the hardware instruction is
// unavailable (CC < 6.0, or ROCm builds — see the guards below). A half2 is
// 4-byte sized/aligned, so the pair maps directly onto one 32-bit word and
// can be updated with a plain CAS loop — no sub-word masking needed.
__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
    unsigned int* address_as_ui = (unsigned int*)address;
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do
    {
        assumed = old;
        // Reinterpret the observed word as half2, add, then CAS the result in;
        // retry with the freshly observed value if another thread intervened.
        half2 old_val = *((half2*)&old);
        half2 new_val = __hadd2(old_val, val);
        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
    }
    while (assumed != old);
}
|
||
// | ||
|
||
// Route the standard atomicAdd overloads to the software fallbacks where the
// native instruction is unavailable (per the file-top comment, half atomicAdd
// needs CC >= 7.x; the inner guard treats half2 atomicAdd as needing CC >= 6.x).
// ROCm builds always take the fallbacks. The outer guard restricts these
// overloads to device-side compilation passes.
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)

__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }

#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif

#endif
#endif
|
||
} // namespace gptq | ||
} // namespace vllm | ||
#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
/* | ||
Adapted from https://github.com/turboderp/exllamav2 and https://github.com/turboderp/exllama
*/ | ||
|
||
#ifndef _matrix_view_cuh | ||
#define _matrix_view_cuh | ||
|
||
#include <cuda_runtime.h> | ||
#include <cuda_fp16.h> | ||
|
||
#include "qdq_util.cuh" | ||
|
||
namespace vllm { | ||
namespace gptq { | ||
|
||
// Read-only device-side view of a dense row-major half matrix.
// All addressing is linear: element (row, column) lives at data[row * width + column].
class MatrixView_half
{
public:
    const half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    // Single-element and vectorized accessors.
    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }

    // Fetch four consecutive elements starting at (row, column) as scalars.
    // Performed as two half2 loads — NOTE(review): assumes the linear index
    // is even (half2-aligned); confirm at call sites.
    __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const
    {
        const half2* pair = (const half2*) item_ptr(row, column);
        half2 lo = pair[0];
        half2 hi = pair[1];
        items[0] = __low2half(lo);
        items[1] = __high2half(lo);
        items[2] = __low2half(hi);
        items[3] = __high2half(hi);
    }

    // Same four elements, converted to float.
    __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const
    {
        const half2* pair = (const half2*) item_ptr(row, column);
        half2 lo = pair[0];
        half2 hi = pair[1];
        items[0] = __half2float(__low2half(lo));
        items[1] = __half2float(__high2half(lo));
        items[2] = __half2float(__low2half(hi));
        items[3] = __half2float(__high2half(hi));
    }

    // Same four elements, each broadcast into both lanes of a half2.
    __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const
    {
        const half2* pair = (const half2*) item_ptr(row, column);
        half2 lo = pair[0];
        half2 hi = pair[1];
        items[0] = __half2half2(__low2half(lo));
        items[1] = __half2half2(__high2half(lo));
        items[2] = __half2half2(__low2half(hi));
        items[3] = __half2half2(__high2half(hi));
    }
};
|
||
// Read/write device-side view of a dense row-major half matrix.
// Mirrors MatrixView_half but over mutable storage, adding scalar and
// vectorized store operations.
class MatrixView_half_rw
{
public:
    half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    // Loads.
    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }

    // Stores.
    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }

    // Store four consecutive elements starting at (row, column) as two half2
    // writes — NOTE(review): assumes the linear index is even (half2-aligned).
    __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3)
    {
        half2* dst = (half2*) item_ptr(row, column);
        dst[0] = __halves2half2(v0, v1);
        dst[1] = __halves2half2(v2, v3);
    }
};
|
||
// Read-only view of a matrix of 4-bit values packed row-major: eight nibbles
// per uint32_t along each row, lowest nibble first. Row stride in words is
// width / 8.
class MatrixView_q4_row
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    // Extract the single 4-bit value at (row, column).
    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (column & 0x07) << 2;       // nibble position within the word
        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
    }

    // Extract two consecutive 4-bit values starting at (row, column).
    __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const
    {
        uint32_t packed = data[row * width / 8 + column / 8] >> ((column & 0x07) << 2);
        items[0] = packed & 0x0f;
        items[1] = (packed >> 4) & 0x0f;
    }

    // Extract four consecutive 4-bit values starting at (row, column).
    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
    {
        uint32_t packed = data[row * width / 8 + column / 8] >> ((column & 0x07) << 2);
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            items[i] = (packed >> (4 * i)) & 0x0f;
    }
};
|
||
// Read-only view of a matrix of 4-bit values packed column-major within
// words: eight consecutive rows of one column share a uint32_t, lowest
// nibble first. Word index for (row, column) is row / 8 * width + column.
class MatrixView_q4_column
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    // Extract the single 4-bit value at (row, column).
    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (row & 0x07) << 2;          // nibble position within the word
        return (data[row / 8 * width + column] >> shift) & 0x0f;
    }

    // Raw access to the packed word holding (row, column).
    __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
};
|
||
} // namespace gptq | ||
} // namespace vllm | ||
#endif |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.