This repository was archived by the owner on Mar 29, 2024. It is now read-only.

Commit 2a797df
add job generation
1 parent 9eabcf6

4 files changed: +149 −2 lines

Dockerfile (+22 −2)
@@ -18,11 +18,31 @@ RUN pip install git+https://github.com/huggingface/transformers.git
 
 RUN mkdir -p /workspace/wav2vec/
 
-COPY finetune.sh /workspace/wav2vec/
-COPY run_common_voice.py /workspace/wav2vec/
+COPY finetune.sh run_common_voice.py finetune_with_params.sh /workspace/wav2vec/
+
 COPY home-server.html /usr/bin/home-server.html
 
 RUN chown -R 42420:42420 /workspace
 
+#Default training env variables
+ENV model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
+    dataset_config_name="fr" \
+    output_dir="/workspace/output_models/wav2vec2-large-xlsr-french-demo" \
+    cache_dir="/workspace/data" \
+    num_train_epochs="1" \
+    per_device_train_batch_size="32" \
+    evaluation_strategy="steps" \
+    learning_rate="3e-4" \
+    warmup_steps="500" \
+    save_steps="10" \
+    eval_steps="10" \
+    save_total_limit="1" \
+    logging_steps="10" \
+    feat_proj_dropout="0.0" \
+    layerdrop="0.1" \
+    max_train_samples=100 \
+    max_val_samples=100
+
+WORKDIR /workspace/wav2vec
 ENTRYPOINT []
 CMD ["supervisord", "-n", "-u", "42420", "-c", "/etc/supervisor/supervisor.conf"]
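Every training hyperparameter is now exposed as an environment variable with a default baked into the image, so a run can be reconfigured without rebuilding. A minimal sketch of overriding a few of them with plain Docker (assuming the image is published as databuzzword/hf-wav2vec, the name referenced by generate_all_trainings.py below, and that a GPU runtime is available; the language code and epoch count are illustrative):

# Override the baked-in defaults at run time; unset variables keep the ENV values.
docker run --gpus all \
    -e dataset_config_name=de \
    -e output_dir=/workspace/output_models/wav2vec2-large-xlsr-german-demo \
    -e num_train_epochs=3 \
    databuzzword/hf-wav2vec \
    sh /workspace/wav2vec/finetune_with_params.sh

Because ENTRYPOINT is empty, the command passed after the image name replaces the default supervisord CMD and runs the fine-tuning script directly.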

finetune_with_params.sh (new file, +29)
#!/usr/bin/env bash

python /workspace/wav2vec/run_common_voice.py \
    --model_name_or_path=$model_name_or_path \
    --dataset_config_name=$dataset_config_name \
    --output_dir=$output_dir \
    --cache_dir=$cache_dir \
    --overwrite_output_dir \
    --num_train_epochs=$num_train_epochs \
    --per_device_train_batch_size=$per_device_train_batch_size \
    --evaluation_strategy=$evaluation_strategy \
    --learning_rate=$learning_rate \
    --warmup_steps=$warmup_steps \
    --fp16 \
    --freeze_feature_extractor \
    --save_steps=$save_steps \
    --eval_steps=$eval_steps \
    --save_total_limit=$save_total_limit \
    --logging_steps=$logging_steps \
    --group_by_length \
    --feat_proj_dropout=$feat_proj_dropout \
    --layerdrop=$layerdrop \
    --gradient_checkpointing \
    --do_train \
    --do_eval \
    --max_train_samples $max_train_samples \
    --max_val_samples $max_val_samples
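The script assumes every variable is already set, which holds inside the image thanks to the ENV defaults above. If it were ever run outside the container, standard bash parameter expansion could supply the same fallbacks; the guard below is a sketch only and is not part of this commit:

#!/usr/bin/env bash
set -euo pipefail

# Mirror the Dockerfile defaults when a variable was not exported (sketch, not in the commit).
: "${model_name_or_path:=facebook/wav2vec2-large-xlsr-53}"
: "${dataset_config_name:=fr}"
: "${num_train_epochs:=1}"

echo "Fine-tuning ${model_name_or_path} on '${dataset_config_name}' for ${num_train_epochs} epoch(s)"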

generate_all_trainings.py (new file, +37)
#!/usr/bin/env python
# coding: utf-8

import os
import csv

# Launch one training job per language listed in wav2vec_languages.csv
with open('wav2vec_languages.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # This skips the first row of the CSV file because it's a header
    next(csv_reader)
    for (language_code, language_full_name) in csv_reader:
        print(f"#Launching Training for {language_code}-{language_full_name}")
        cmd = f"ovhai job run --gpu 1 --name '{language_code}-{language_full_name}' --volume output_models@GRA:/workspace/output_models:RW:cache -e model_name_or_path='facebook/wav2vec2-large-xlsr-53' -e dataset_config_name={language_code} -e output_dir='/workspace/output_models/wav2vec2-large-xlsr-{language_code}-{language_full_name}-demo' -e cache_dir='/workspace/data' databuzzword/hf-wav2vec -- sh /workspace/wav2vec/finetune_with_params.sh"
        print(cmd)
        # Run the command and echo whatever the CLI printed
        stream = os.popen(cmd)
        output = stream.read()
        print(output)
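Each iteration prints the generated command before launching it, so the exact invocation appears in the logs. For reference, the command built for the first data row of wav2vec_languages.csv (ab, abkhazian) expands to the following, wrapped across lines here for readability:

ovhai job run --gpu 1 --name 'ab-abkhazian' \
    --volume output_models@GRA:/workspace/output_models:RW:cache \
    -e model_name_or_path='facebook/wav2vec2-large-xlsr-53' \
    -e dataset_config_name=ab \
    -e output_dir='/workspace/output_models/wav2vec2-large-xlsr-ab-abkhazian-demo' \
    -e cache_dir='/workspace/data' \
    databuzzword/hf-wav2vec -- sh /workspace/wav2vec/finetune_with_params.sh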

wav2vec_languages.csv (new file, +61)
language_code,language_full_name
ab,abkhazian
ar,arabic
as,assamese
br,breton
ca,catalan
cnh,cnh
cs,czech
cv,chuvash
cy,welsh
de,german
dv,divehi
el,greek
en,english
eo,esperanto
es,spanish
et,estonian
eu,basque
fa,persian
fi,finnish
fr,french
fy-NL,western_frisian-netherlands
ga-IE,irish-ireland
hi,hindi
hsb,upper_sorbian
hu,hungarian
ia,interlingua
id,indonesian
it,italian
ja,japanese
ka,georgian
kab,kabyle
ky,kyrgyz
lg,ganda
lt,lithuanian
lv,latvian
mn,mongolian
mt,maltese
nl,dutch
or,odia
pa-IN,punjabi-india
pl,polish
pt,portuguese
rm-sursilv,romansh_sursilv
rm-vallader,romansh_vallader
ro,romanian
ru,russian
rw,kinyarwanda
sah,sakha
sl,slovenian
sv-SE,swedish-sweden
ta,tamil
th,thai
tr,turkish
tt,tatar
uk,ukrainian
vi,vietnamese
vot,votic
zh-CN,chinese-china
zh-HK,chinese-hong_kong_sar_china
zh-TW,chinese-taiwan
