<!doctype html>
<html lang="en">
<head>
  <!-- Declare the encoding first so it is seen before any other head content. -->
  <meta charset="utf-8">
  <!--
    Both bundles are ES modules, which are deferred by default, so no `defer` is needed.
    `blocking` takes a token list ("render" is the only defined token); a bare
    `blocking` attribute is empty and has no effect.
  -->
  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking="render"></script>
  <script src="main.bundle.js" type="module" fetchpriority="low"></script>
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <!-- Links open in a new tab unless they set their own target. -->
  <base target="_blank">
  <title>Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks</title>
  <link rel="stylesheet" href="style.css">
</head>
<body>
<!-- Front-matter metadata (JSON) for the article: title, description, date, affiliation, authors. -->
<d-front-matter>
  <script id="distill-front-matter" type="text/json">{
    "title": "📝 Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks",
    "description": "This blog covers a discussion on multilingual evaluation and task signal, the processes for selecting existing evaluation tasks based on signal resulting in FineTasks, and a comparison of open and closed-source models on FineTasks.",
    "published": "Oct 23, 2024",
    "affiliation": {"name": "HuggingFace"},
    "authors": [
      {
        "author": "Hynek Kydlíček",
        "authorURL": "https://huggingface.co/hynky"
      },
      {
        "author": "Guilherme Penedo",
        "authorURL": "https://huggingface.co/guipenedo"
      },
      {
        "author": "Clémentine Fourrier",
        "authorURL": "https://huggingface.co/clefourrier"
      },
      {
        "author": "Nathan Habib",
        "authorURL": "https://huggingface.co/SaylorTwift"
      },
      {
        "author": "Thomas Wolf",
        "authorURL": "https://huggingface.co/thomwolf"
      }
    ]
  }</script>
</d-front-matter>
<d-byline></d-byline>
<d-article>
  <!-- Left empty on purpose: the inline script at the end of <body> fills in the table of contents. -->
  <d-contents>
  </d-contents>
  <p>We're looking forward to revisiting this analysis in the future, not with just 9 languages, but at least 50—thanks to community contributions! Let's level the playing field between English and other languages together! 🤗</p>
  <d-math> 1+1=2 </d-math>
</d-article>
<d-appendix>
  <d-bibliography src="bibliography.bib"></d-bibliography>
  <style>
    /* Boxed, muted styling shared by the short and BibTeX citation snippets. */
    d-appendix .citation {
      font-size: 11px;
      line-height: 15px;
      border: 1px solid rgba(0, 0, 0, 0.1);
      background: rgba(0, 0, 0, 0.02);
      padding: 10px 18px;
      border-radius: 3px;
      color: rgba(150, 150, 150, 1);
      overflow: hidden;
      margin-top: -12px;
      /* Keep the line breaks of the snippet but let long lines (e.g. URLs) wrap. */
      white-space: pre-wrap;
      overflow-wrap: break-word;
    }
  </style>
  <h3 id="citation">Citation</h3>
  <p>For attribution in academic contexts, please cite this work as</p>
  <pre class="citation short">Kydlicek, et al., "FineTasks: Finding signal in a haystack of 200+ multilingual tasks", 2024.</pre>
  <p>BibTeX citation</p>
  <!-- Whitespace inside <pre> is rendered verbatim, so the BibTeX lines are not indented with the markup. -->
  <pre class="citation long">@misc{kydlicek2024finetasksmultilingualtasks,
  title={FineTasks: Finding signal in a haystack of 200+ multilingual tasks},
  author={Hynek Kydlíček and Guilherme Penedo and Clémentine Fourrier and Nathan Habib and Thomas Wolf},
  year={2024},
  url={https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks},
}</pre>
</d-appendix>
<script>
  // Builds the table of contents inside <d-contents> from the h2/h3/h4 headings of
  // <d-article>, then keeps the entry for the section currently being read highlighted
  // while the page scrolls.
  const article = document.querySelector('d-article');
  const toc = document.querySelector('d-contents');
  if (toc && article) {
    // Heading text is interpolated into an HTML string below, so markup-significant
    // characters must be escaped first.
    const escapeHtml = (text) => text
      .replaceAll('&', '&amp;')
      .replaceAll('<', '&lt;')
      .replaceAll('>', '&gt;')
      .replaceAll('"', '&quot;');

    // Headings that actually receive a TOC entry, in document order. Exactly one <a>
    // is emitted per entry, so tocHeadings[i] corresponds to tocLinks[i] further down.
    const tocHeadings = [];
    let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
    let prevLevel = 0;
    for (const el of article.querySelectorAll('h2, h3, h4')) {
      // Skip headings inside <d-title> and headings opted out with `no-toc`.
      // hasAttribute() is required here: a bare `no-toc` attribute has the value "",
      // which getAttribute() returns as a falsy string.
      const isInTitle = el.parentElement.tagName === 'D-TITLE';
      const isException = el.hasAttribute('no-toc');
      if (isInTitle || isException) continue;

      // The anchor id is derived from the heading text (lower-cased, spaces -> "_").
      el.setAttribute('id', el.textContent.toLowerCase().replaceAll(' ', '_'));
      tocHeadings.push(el);

      // target="_self" overrides the document-wide <base target="_blank">.
      const link = '<a target="_self" href="#' + escapeHtml(el.getAttribute('id')) + '">'
        + escapeHtml(el.textContent) + '</a>';

      // h2 -> level 0 (top-level <div>), h3 -> level 1, h4 -> level 2 (nested <ul>s).
      const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
      while (prevLevel < level) {
        ToC += '<ul>';
        prevLevel++;
      }
      while (prevLevel > level) {
        ToC += '</ul>';
        prevLevel--;
      }
      if (level === 0) {
        ToC += '<div>' + link + '</div>';
      } else {
        ToC += '<li>' + link + '</li>';
      }
    }
    // Close any lists still open after the last heading.
    while (prevLevel > 0) {
      ToC += '</ul>';
      prevLevel--;
    }
    ToC += '</nav>';
    toc.innerHTML = ToC;
    toc.setAttribute('prerendered', 'true');

    const tocLinks = document.querySelectorAll('d-contents > nav a');
    window.addEventListener('scroll', () => {
      // Walk backwards so the last heading whose top has passed the 50px threshold
      // wins; if none has, no entry is highlighted.
      let activeLink = null;
      for (let i = tocHeadings.length - 1; i >= 0; i--) {
        if (tocHeadings[i].getBoundingClientRect().top - 50 <= 0) {
          activeLink = tocLinks[i] ?? null;
          break;
        }
      }
      tocLinks.forEach((link) => {
        link.classList.toggle('active', link === activeLink);
      });
    });
  }
</script>
</body>
</html>