# Training configuration for the run named `comma_v0p1_yolooooo`.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file (indentation had been lost and each line carried table-pipe
# residue). Keys and values are unchanged; the grouping follows the
# empty-valued parent keys and the duplicate key names (`seed`, `eval`,
# `every`, `keep`, `name`), which can only coexist when nested. Confirm against
# the consuming trainer's argument schema before launching.

name: comma_v0p1_yolooooo
dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/
seed: 777
grad_acc_steps: 4
gc_collect_freq: 1000
probe_freq: null
steps: 500000

data:
  root_dir: /scratch/craffel/lingua/data/
  # Per-source mixture weights. The 31 values below sum to 1.0.
  # Presumably these are sampling probabilities relative to `root_dir`
  # sub-directories - TODO confirm against the data loader.
  sources:
    peS2o: 0.274065475510351
    stackexchange: 0.134617935796937
    stackv2_edu: 0.127770669195666
    cccc: 0.0871992270000557
    wikimedia: 0.0861800315862719
    github_archive: 0.0606452345122248
    uspto: 0.0413469377516883
    pubmed: 0.0367902799837971
    arxiv_papers: 0.0292395449667613
    caselaw_access_project: 0.0193875362722656
    wikiteam: 0.0137485410839637
    doab: 0.0180439781895451
    uk_hansard: 0.0144498535570883
    pre_1929_books: 0.0115755547988338
    ubuntu_irc: 0.00794254267719456
    regulations: 0.00762583706405442
    data_provenance_initiative: 0.00512264496834867
    project_gutenberg: 0.00502100654070129
    youtube: 0.00465917165839394
    arxiv_abstracts: 0.00359635066160403
    stackv2_html: 0.00225924255952781
    usgpo: 0.00226024581728848
    library_of_congress: 0.00222469340783564
    biodiversity_heritage_library: 0.00221737524370278
    pressbooks: 0.000865101033213598
    libretexts: 0.00054149556727006
    news: 0.000372716196818104
    foodista: 0.000125363443065615
    oercommons: 7.78696843693821e-05
    python_enhancement_proposals: 1.69983991984805e-05
    public_domain_review: 1.05448719635173e-05
  batch_size: 2
  # Matches model.max_seqlen (4096) below.
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  tokenizer:
    name: tiktoken
    path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken

optim:
  lr: 0.001
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5

model:
  dim: 4096
  n_layers: 32
  head_dim: null
  n_heads: 32
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  init_std_factor: disabled
  max_seqlen: 4096
  seed: 42
  vocab_size: 64256
  weight_tying: false
  sliding_window: null

distributed:
  dp_shard: 1
  dp_replicate: 64
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Regular expression kept as a plain (unquoted) scalar so the backslashes
  # are taken literally.
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver

# Environment-variable style settings. Numeric values are quoted so they
# load as strings rather than integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'

checkpoint:
  # Two checkpoint schedules, each with its own interval and retention count.
  # NOTE(review): `keep: -1` presumably means "keep all" - confirm.
  dump:
    every: 10000
    keep: -1
  eval:
    every: 2000
    keep: 3
  path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false

profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

logging:
  freq: 1
  acc_freq: null
  wandb: null

async_eval_gpus: 8

eval:
  harness:
    # Evaluation tasks. Entries are either a bare task name or a mapping with
    # `task` plus `dataset_kwargs`; the mapping form is used for the three
    # tasks that set `trust_remote_code: true`.
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - task: copa
        dataset_kwargs:
          trust_remote_code: true
      - mmlu
      - mmlu_pro
  generator:
    # NOTE(review): 8192 is twice model.max_seqlen (4096); confirm this is a
    # packed-batch token budget rather than a per-sequence length.
    max_tokens: 8192
    dtype: bf16