.gitattributes | |
|
|
.gitignore | |
|
|
LICENSE | |
|
|
NOTICE | |
|
|
README.md | |
|
|
configs/gpt2_tiny_v1.json | |
|
|
configs/gpt_neo_small_v1.json | |
|
|
configs/gpt_neo_small_v2.json | |
|
|
configs/gpt_neo_tiny_v1.json | |
|
|
configs/gpt_neo_tiny_v2.json | |
|
|
data_processing/calc_ngrams.py | 14 | A 0 mins |
|
data_processing/clean_ficbook.py | 45 | A 1 hr |
|
data_processing/convert_math.py | 52 | A 0 mins |
|
data_processing/convert_mc4.py | 79 | C 1 day |
|
data_processing/convert_opensubtitles.py | 77 | A 0 mins |
|
data_processing/convert_pikabu.py | 137 | C 1 day |
|
data_processing/convert_wiki.py | 136 | B 4 hrs |
|
data_processing/convert_yandex_q.py | 70 | B 4 hrs |
|
data_processing/create_ficbook.py | 149 | B 5 hrs |
|
data_processing/create_habr.py | 186 | C 1 day |
|
data_processing/create_librusec.py | 68 | B 6 hrs |
|
data_processing/create_ru_news.py | 174 | D 1 day |
|
data_processing/create_stackoverflow.py | 269 | D 2 days |
|
data_processing/create_stihi.py | 87 | C 1 day |
|
data_processing/exact_undup.py | 33 | A 55 mins |
|
data_processing/filter_with_tokenizer.py | 20 | A 0 mins |
|
data_processing/hf_to_instruct.py | 156 | C 7 hrs |
|
data_processing/lang_detector.py | 12 | A 0 mins |
|
data_processing/merge.py | 33 | A 55 mins |
|
data_processing/parse_fb2.py | 186 | A 2 hrs |
|
data_processing/parse_zip_fb2.py | 61 | B 4 hrs |
|
data_processing/requirements.txt | |
|
|
data_processing/run.sh | |
|
|
data_processing/save_hf.py | 333 | D 2 days |
|
data_processing/save_mc4.py | 120 | D 2 days |
|
data_processing/split.py | 33 | A 45 mins |
|
data_processing/undup.py | 72 | A 55 mins |
|
data_processing/util.py | 243 | A 2 hrs |
|
requirements.txt | |
|
|
resources/mc4_bad_hosts.txt | |
|
|
resources/mc4_good_hosts.txt | |
|
|
resources/mc4_news_hosts.txt | |
|
|
resources/mc4_ru_bad_words.txt | |
|
|
rulm/generate.py | 18 | A 0 mins |
|
rulm/jsonl_loader.py | 53 | B 4 hrs |
|
rulm/preprocess.py | 61 | A 35 mins |
|
rulm/train.py | 103 | A 1 hr |
|
rulm/train_tokenizer.py | 71 | A 1 hr |
|
rulm/util.py | 12 | A 2 hrs |
|
self_instruct/.gitattributes | |
|
|