# comma-v0.1 / config.yaml
# craffel's picture
# craffel HF Staff
# Upload config.yaml with huggingface_hub
# 2705b16 verified
# Run-level scalar settings (top-level keys).
# NOTE(review): what each key controls is inferred from its name only —
# confirm against the consuming trainer's schema.
name: 'comma_v0p1_yolooooo'
dump_dir: '/fsx/craffel/lingua_logs/comma_v0p1/'
seed: 777
grad_acc_steps: 4
gc_collect_freq: 1000
probe_freq: null  # explicitly null
steps: 500000
# Data section: root directory, per-source weights, batching and tokenizer.
# NOTE(review): in the original every child key was flush-left (column 0),
# which made them top-level siblings and duplicated `seed`, `name` and `path`.
# The nesting below is reconstructed — confirm against the consumer's schema.
data:
  root_dir: '/scratch/craffel/lingua/data/'
  # Source name -> numeric weight; values are kept exactly as written.
  # Presumably sampling proportions — TODO confirm with the data loader.
  sources:
    peS2o: 0.274065475510351
    stackexchange: 0.134617935796937
    stackv2_edu: 0.127770669195666
    cccc: 0.0871992270000557
    wikimedia: 0.0861800315862719
    github_archive: 0.0606452345122248
    uspto: 0.0413469377516883
    pubmed: 0.0367902799837971
    arxiv_papers: 0.0292395449667613
    caselaw_access_project: 0.0193875362722656
    wikiteam: 0.0137485410839637
    doab: 0.0180439781895451
    uk_hansard: 0.0144498535570883
    pre_1929_books: 0.0115755547988338
    ubuntu_irc: 0.00794254267719456
    regulations: 0.00762583706405442
    data_provenance_initiative: 0.00512264496834867
    project_gutenberg: 0.00502100654070129
    youtube: 0.00465917165839394
    arxiv_abstracts: 0.00359635066160403
    stackv2_html: 0.00225924255952781
    usgpo: 0.00226024581728848
    library_of_congress: 0.00222469340783564
    biodiversity_heritage_library: 0.00221737524370278
    pressbooks: 0.000865101033213598
    libretexts: 0.00054149556727006
    news: 0.000372716196818104
    foodista: 0.000125363443065615
    oercommons: 7.78696843693821e-05
    python_enhancement_proposals: 1.69983991984805e-05
    public_domain_review: 1.05448719635173e-05
  batch_size: 2
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  # NOTE(review): `tokenizer` is placed under `data`; left flat, its `name`
  # and `path` keys collide with other keys in the file. Verify this placement.
  tokenizer:
    name: tiktoken
    path: '/fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken'
# Optimizer and learning-rate schedule settings.
# NOTE(review): child keys were flush-left in the original, so `optim` parsed
# as an empty (null) key with its settings as top-level siblings. Nesting is
# restored here; confirm against the consumer's schema.
optim:
  lr: 0.001
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
# Model architecture settings.
# NOTE(review): child keys were flush-left in the original, which left `model`
# empty and made its `seed` a duplicate of the other `seed` keys in the file.
# Nesting is restored here; confirm against the consumer's schema.
model:
  dim: 4096
  n_layers: 32
  head_dim: null
  n_heads: 32
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  init_std_factor: disabled  # plain string value, not a boolean
  max_seqlen: 4096
  seed: 42
  vocab_size: 64256
  weight_tying: false
  sliding_window: null
# Distributed-execution settings (parallelism degrees, compile, dtypes).
# NOTE(review): child keys were flush-left in the original, so `distributed`
# parsed as empty with its settings as top-level siblings. Nesting is restored
# here; confirm against the consumer's schema.
distributed:
  dp_shard: 1
  dp_replicate: 64
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Single-quoted so the backslashes and brackets stay literal; the string
  # value is unchanged.
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
# Environment-variable name -> value pairs.
# NOTE(review): child keys were flush-left in the original, so `env` parsed as
# empty with the variables as top-level siblings. Nesting is restored here.
# All values are quoted so each one loads as a string, matching the entries
# that were already quoted.
env:
  MKL_SERVICE_FORCE_INTEL: 'GNU'
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: 'INFO'
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
# Checkpoint settings: two `every`/`keep` groups plus paths.
# NOTE(review): child keys were flush-left in the original, so `every` and
# `keep` each appeared twice at the same level and `eval` collided with the
# later top-level `eval` key. The nesting below is reconstructed — confirm
# against the consumer's schema.
checkpoint:
  dump:
    every: 10000
    keep: -1  # NOTE(review): -1 presumably means "keep all" — confirm
  eval:
    every: 2000
    keep: 3
  path: '/fsx/craffel/lingua_logs/comma_v0p1/checkpoints'
  init_ckpt_path: null
  continue_training_from_init: false
# Profiler settings.
# NOTE(review): child keys were flush-left in the original, so `profiling`
# parsed as empty with its settings as top-level siblings. Nesting is restored
# here; confirm against the consumer's schema.
profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
# Logging settings.
# NOTE(review): child keys were flush-left in the original, so `logging`
# parsed as empty with its settings as top-level siblings. Nesting is restored
# for the three keys that directly follow; confirm against the consumer's
# schema.
logging:
  freq: 1
  acc_freq: null
  wandb: null
# NOTE(review): left at top level. Indentation was lost in this file, so
# confirm this key is not meant to be a child of the preceding section.
async_eval_gpus: 8
# Evaluation settings: harness task list and generator options.
# NOTE(review): in the original everything was flush-left, so the
# `dataset_kwargs` / `trust_remote_code` keys interrupted the task sequence
# (not parseable as a list) and this `eval` key collided with another `eval`
# key earlier in the file. The nesting below is reconstructed — confirm
# against the consumer's schema.
eval:
  harness:
    # Entries are either a bare task name or a mapping with `task` plus
    # per-task `dataset_kwargs`.
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - task: copa
        dataset_kwargs:
          trust_remote_code: true
      - mmlu
      - mmlu_pro
  # NOTE(review): `generator` is placed as a sibling of `harness` under
  # `eval`; verify this placement.
  generator:
    max_tokens: 8192
    dtype: bf16