IlyaGusev/rulm

View on GitHub
Branch: master(View all)
NameLines of codeMaintainabilityTest coverage
.gitattributes
.gitignore
LICENSE
NOTICE
README.md
configs/gpt2_tiny_v1.json
configs/gpt_neo_small_v1.json
configs/gpt_neo_small_v2.json
configs/gpt_neo_tiny_v1.json
configs/gpt_neo_tiny_v2.json
data_processing/calc_ngrams.py14
A
0 mins
data_processing/clean_ficbook.py45
A
1 hr
data_processing/convert_math.py52
A
0 mins
data_processing/convert_mc4.py79
C
1 day
data_processing/convert_opensubtitles.py77
A
0 mins
data_processing/convert_pikabu.py137
C
1 day
data_processing/convert_wiki.py136
B
4 hrs
data_processing/convert_yandex_q.py70
B
4 hrs
data_processing/create_ficbook.py149
B
5 hrs
data_processing/create_habr.py186
C
1 day
data_processing/create_librusec.py68
B
6 hrs
data_processing/create_ru_news.py174
D
1 day
data_processing/create_stackoverflow.py269
D
2 days
data_processing/create_stihi.py87
C
1 day
data_processing/exact_undup.py33
A
55 mins
data_processing/filter_with_tokenizer.py20
A
0 mins
data_processing/hf_to_instruct.py156
C
7 hrs
data_processing/lang_detector.py12
A
0 mins
data_processing/merge.py33
A
55 mins
data_processing/parse_fb2.py186
A
2 hrs
data_processing/parse_zip_fb2.py61
B
4 hrs
data_processing/requirements.txt
data_processing/run.sh
data_processing/save_hf.py333
D
2 days
data_processing/save_mc4.py120
D
2 days
data_processing/split.py33
A
45 mins
data_processing/undup.py72
A
55 mins
data_processing/util.py243
A
2 hrs
requirements.txt
resources/mc4_bad_hosts.txt
resources/mc4_good_hosts.txt
resources/mc4_news_hosts.txt
resources/mc4_ru_bad_words.txt
rulm/generate.py18
A
0 mins
rulm/jsonl_loader.py53
B
4 hrs
rulm/preprocess.py61
A
35 mins
rulm/train.py103
A
1 hr
rulm/train_tokenizer.py71
A
1 hr
rulm/util.py12
A
2 hrs
self_instruct/.gitattributes