configs/gpt_neo_tiny_v2.json
{
  "block_size": 512,
  "model": {
    "type": "EleutherAI/gpt-neo-125M",
    "num_layers": 6,
    "attention_dropout": 0.0,
    "embed_dropout": 0.0,
    "attention_layers": [
      "global",
      "local",
      "global",
      "local",
      "global",
      "local"
    ],
    "attention_types": [
      [
        [
          "global",
          "local"
        ],
        3
      ]
    ],
    "num_heads": 8,
    "hidden_size": 512,
    "max_position_embeddings": 512,
    "use_cache": true
  },
  "trainer": {
    "evaluation_strategy": "steps",
    "per_device_train_batch_size": 51,
    "per_device_eval_batch_size": 51,
    "gradient_accumulation_steps": 7,
    "eval_steps": 2000,
    "save_steps": 2000,
    "logging_steps": 500,
    "learning_rate": 0.0006,
    "num_train_epochs": null,
    "max_steps": 20000,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 2000,
    "weight_decay": 0.01,
    "fp16": true,
    "bf16": false,
    "torch_compile": true,
    "gradient_checkpointing": false,
    "optim": "adamw_apex_fused",
    "half_precision_backend": "auto",
    "fp16_opt_level": "O2"
  }
}
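
A minimal sketch of how a config like this might be consumed, assuming a training script built on Hugging Face transformers: the "model" block maps onto GPTNeoConfig (with "type" naming the base checkpoint whose architecture values are overridden) and the "trainer" block maps one-to-one onto TrainingArguments fields. The output_dir value and the omission of dataset/tokenizer wiring are illustrative assumptions, not part of the file above.

# sketch.py (hypothetical) -- assumes Hugging Face transformers is installed;
# dataset, tokenizer, and Trainer construction are omitted.
import json

from transformers import GPTNeoConfig, GPTNeoForCausalLM, TrainingArguments

with open("configs/gpt_neo_tiny_v2.json") as f:
    cfg = json.load(f)

model_cfg = dict(cfg["model"])
base_checkpoint = model_cfg.pop("type")  # "EleutherAI/gpt-neo-125M"

# Start from the 125M architecture config, overriding the scaled-down values
# from the JSON (6 layers, 8 heads, hidden_size 512, 512-token context).
config = GPTNeoConfig.from_pretrained(base_checkpoint, **model_cfg)
model = GPTNeoForCausalLM(config)  # freshly initialized weights at the reduced size

trainer_cfg = dict(cfg["trainer"])
if trainer_cfg.get("num_train_epochs") is None:
    # max_steps (20000) drives the schedule, so drop the null epoch count.
    trainer_cfg.pop("num_train_epochs")

training_args = TrainingArguments(
    output_dir="out/gpt_neo_tiny_v2",  # hypothetical output directory
    **trainer_cfg,
)

Note on the trainer block as written: 51 sequences per device times 7 accumulation steps gives an effective batch of 357 sequences (about 183k tokens at block_size 512) per optimizer step, with a 2,000-step warmup into a cosine decay over 20,000 steps.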