generated from Lightning-AI/deep-learning-project-template
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtrain.bash
40 lines (33 loc) · 1.15 KB
/
train.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/bin/bash
####################### BSUB Headers #########################
#BSUB -J train_lit_image_classifier_model_with_pl
#BSUB -P bif132
#BSUB -W 0:30
#BSUB -nnodes 32
#BSUB -q batch
#BSUB -alloc_flags "gpumps"
#BSUB -o job%J.out
#BSUB -e job%J.out
###############################################################
# LSF batch job that launches project/lit_image_classifier.py via jsrun.
# Both the -o and -e headers above name the same file (job%J.out), so stdout
# and stderr of this script end up together.
# NOTE: "-nnodes 32" above and "--num_compute_nodes 32" in the jsrun command
# below are two separate hard-coded values; keep them in sync by hand.
# The script exits with the status of the jsrun step so that a failed
# training run is not reported as a successful job.

# Remote Conda environment
module load open-ce/1.2.0-py38-0
conda activate DLHPT

# Remote project path (the directory the script is started from)
export PROJDIR="$PWD"

# Set number of threads for OMP
# (jsrun below uses -c42 with -a6; 42 / 6 = 7 — presumably why 7 is used here)
export OMP_PLACES=threads
export OMP_NUM_THREADS=7

# Configure proxy access on compute nodes
export WANDB_INSECURE_DISABLE_SSL=true
export all_proxy=socks://proxy.ccs.ornl.gov:3128/
export ftp_proxy=ftp://proxy.ccs.ornl.gov:3128/
export http_proxy=http://proxy.ccs.ornl.gov:3128/
export https_proxy=http://proxy.ccs.ornl.gov:3128/
export no_proxy='localhost,127.0.0.0/8,.ccs.ornl.gov,.ncrc.gov'
export LC_ALL=en_US.utf8

# Run training script from the project sub-directory; abort if it is missing
# rather than launching jsrun from the wrong place.
cd "$PROJDIR/project" || {
  printf 'error: cannot change directory to %s\n' "$PROJDIR/project" >&2
  exit 1
}

# Execute script, bracketed by timestamps.
date
jsrun -g6 -a6 -c42 -r1 \
  python3 lit_image_classifier.py \
  --logger_name WandB \
  --num_gpus 6 \
  --num_compute_nodes 32 \
  --num_epochs 50 \
  --batch_size 64 \
  --hidden_dim 512 \
  --lr 1e-4 \
  --num_dataloader_workers 28
# Capture the status immediately: the following `date` would otherwise
# overwrite $? and become the exit status of the whole job.
train_status=$?
date

if [[ "$train_status" -ne 0 ]]; then
  printf 'error: jsrun exited with status %s\n' "$train_status" >&2
fi
exit "$train_status"