Skip to content

fix library detection #873

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 19 additions & 30 deletions bitsandbytes/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,18 @@

HEADER_WIDTH = 60

def execute_and_return(command_string: str) -> Tuple[str, str]:
def _decode(subprocess_err_out_tuple):
return tuple(
to_decode.decode("UTF-8").strip()
for to_decode in subprocess_err_out_tuple
)

def execute_and_return_decoded_std_streams(command_string):
return _decode(
subprocess.Popen(
shlex.split(command_string),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
).communicate()
)

std_out, std_err = execute_and_return_decoded_std_streams(command_string)
return std_out, std_err

def find_file_recursive(folder, filename):
folder = shlex.quote(folder)
filename = shlex.quote(filename)
cmd = f'find {folder} -name {filename}'
out, err = execute_and_return(cmd)
if len(err) > 0:
raise RuntimeError('Something when wrong when trying to find file. Maybe you do not have a linux system?')
import glob
outs = []
try:
for ext in ["so", "dll", "dylib"]:
out = glob.glob(os.path.join(folder, "**", filename + ext))
outs.extend(out)
except Exception as e:
raise RuntimeError('Error: Something when wrong when trying to find file. {e}')
Copy link
Contributor

@akx akx Jan 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is missing an f so the exception is never printed. (That's fixed in #984.)


return out
return outs
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the list instead of path cause errors in some edge cases? As I understand it, we either have dll or dylib, is that correct? Or can we have both in some cases?



def generate_bug_report_information():
Expand All @@ -48,30 +32,35 @@ def generate_bug_report_information():
print('')

if 'CONDA_PREFIX' in os.environ:
paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*so')
paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to make sure that this does not match non-binary files. How can we make sure that this is correct in most environments?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dll, so, or dylib ext part appended later in the find_file_recursive()

print_header("ANACONDA CUDA PATHS")
print(paths)
print('')
if isdir('/usr/local/'):
paths = find_file_recursive('/usr/local', '*cuda*so')
paths = find_file_recursive('/usr/local', '*cuda*')
print_header("/usr/local CUDA PATHS")
print(paths)
print('')
if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']):
paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*')
print_header("CUDA PATHS")
print(paths)
print('')

if isdir(os.getcwd()):
paths = find_file_recursive(os.getcwd(), '*cuda*so')
paths = find_file_recursive(os.getcwd(), '*cuda*')
print_header("WORKING DIRECTORY CUDA PATHS")
print(paths)
print('')

print_header("LD_LIBRARY CUDA PATHS")
if 'LD_LIBRARY_PATH' in os.environ:
lib_path = os.environ['LD_LIBRARY_PATH'].strip()
for path in set(lib_path.split(':')):
for path in set(lib_path.split(os.pathsep)):
try:
if isdir(path):
print_header(f"{path} CUDA PATHS")
paths = find_file_recursive(path, '*cuda*so')
paths = find_file_recursive(path, '*cuda*')
print(paths)
except:
print(f'Could not read LD_LIBRARY_PATH: {path}')
Expand Down
2 changes: 1 addition & 1 deletion bitsandbytes/cuda_setup/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def to_be_ignored(env_var: str, value: str) -> bool:


def might_contain_a_path(candidate: str) -> bool:
return "/" in candidate
return os.sep in candidate


def is_active_conda_env(env_var: str) -> bool:
Expand Down
33 changes: 22 additions & 11 deletions bitsandbytes/cuda_setup/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import ctypes as ct
import os
import errno
import platform
import torch
from warnings import warn
from itertools import product
Expand All @@ -31,7 +32,11 @@
# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
# we have libcudart.so.11.0 which causes a lot of errors before
# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2']
system = platform.system()
if system == 'Windows':
CUDA_RUNTIME_LIBS: list = ["nvcuda.dll"]
else: # Linux or other
CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2']

# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths
backup_paths = []
Expand Down Expand Up @@ -114,7 +119,9 @@ def manual_override(self):
'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
f'\n{"="*80}\n\n'))
self.binary_name = self.binary_name[:-6] + f'{os.environ["BNB_CUDA_VERSION"]}.so'
binary_name = self.binary_name.rsplit(".", 1)[0]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit of a mess (based on my messy previous code). I have not looked at the details. Is there a way where we can clean this up slightly?

suffix = ".so" if os.name != "nt" else ".dll"
self.binary_name = binary_name[:-3] + f'{os.environ["BNB_CUDA_VERSION"]}.{suffix}'

def run_cuda_setup(self):
self.initialized = True
Expand All @@ -131,10 +138,11 @@ def run_cuda_setup(self):
package_dir = Path(__file__).parent.parent
binary_path = package_dir / self.binary_name

suffix = ".so" if os.name != "nt" else ".dll"
try:
if not binary_path.exists():
self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
legacy_binary_name = "libbitsandbytes_cpu.so"
legacy_binary_name = f"libbitsandbytes_cpu{suffix}"
self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
binary_path = package_dir / legacy_binary_name
if not binary_path.exists() or torch.cuda.is_available():
Expand All @@ -153,10 +161,10 @@ def run_cuda_setup(self):
self.add_log_entry('')
self.generate_instructions()
raise Exception('CUDA SETUP: Setup Failed!')
self.lib = ct.cdll.LoadLibrary(binary_path)
self.lib = ct.cdll.LoadLibrary(str(binary_path))
else:
self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
self.lib = ct.cdll.LoadLibrary(binary_path)
self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path!s}...")
self.lib = ct.cdll.LoadLibrary(str(binary_path))
except Exception as ex:
self.add_log_entry(str(ex))

Expand Down Expand Up @@ -190,7 +198,7 @@ def is_cublasLt_compatible(cc):
return has_cublaslt

def extract_candidate_paths(paths_list_candidate: str) -> Set[Path]:
return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path}
return {Path(ld_path) for ld_path in paths_list_candidate.split(os.pathsep) if ld_path}


def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
Expand Down Expand Up @@ -336,13 +344,14 @@ def get_compute_capabilities():

def evaluate_cuda_setup():
cuda_setup = CUDASetup.get_instance()
suffix = ".so" if os.name != "nt" else ".dll"
if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
cuda_setup.add_log_entry('')
cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35)
cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
cuda_setup.add_log_entry('='*80)
if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None
if not torch.cuda.is_available(): return f'libbitsandbytes_cpu{suffix}', None, None, None

cudart_path = determine_cuda_runtime_lib_path()
ccs = get_compute_capabilities()
Expand All @@ -366,9 +375,11 @@ def evaluate_cuda_setup():
# since most installations will have the libcudart.so installed, but not the compiler

if has_cublaslt:
binary_name = f"libbitsandbytes_cuda{cuda_version_string}.so"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
else:
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so"
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt"

binary_name = f"{binary_name}{suffix}"

return binary_name, cudart_path, cc, cuda_version_string
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@


libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll"))
libs = [os.path.basename(p) for p in libs]
print("libs:", libs)

Expand Down