the Ultrascale-Playbook, a comprehensive book covering all distributed/parallelisation and low-level techniques that can be used to efficiently train models at the largest scales.
\n\n","classNames":"hf-sanitized hf-sanitized-FZgIYn2UNXZivJ9RI70O-"},"users":[{"_id":"5df7e9e5da6d0311fd3d53f9","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1583857746553-5df7e9e5da6d0311fd3d53f9.jpeg","isPro":true,"fullname":"Thomas Wolf","user":"thomwolf","type":"user"},{"_id":"5ff8c9f4b2035d9a81a859f7","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1652134289581-5ff8c9f4b2035d9a81a859f7.jpeg","isPro":false,"fullname":"Nouamane Tazi","user":"nouamanetazi","type":"user"},{"_id":"61c141342aac764ce1654e43","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61c141342aac764ce1654e43/81AwoT5IQ_Xdw0OVw7TKu.jpeg","isPro":false,"fullname":"Loubna Ben Allal","user":"loubnabnl","type":"user"},{"_id":"64622c4093f702673bf9b953","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/iB42uASF1DoQth23cizSh.png","isPro":false,"fullname":"Ferdinand Mom","user":"3outeille","type":"user"},{"_id":"62e0626a1b0ece20b8aaf2b8","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62e0626a1b0ece20b8aaf2b8/TFFSkfqIcrHgoufx8HGya.png","isPro":false,"fullname":"neuralink","user":"neuralink","type":"user"},{"_id":"63e0eea7af523c37e5a77966","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1678663263366-63e0eea7af523c37e5a77966.jpeg","isPro":false,"fullname":"Nathan Habib","user":"SaylorTwift","type":"user"},{"_id":"5e48005437cb5b49818287a5","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5e48005437cb5b49818287a5/4uCXGGui-9QifAT4qelxU.png","isPro":false,"fullname":"Leandro von Werra","user":"lvwerra","type":"user"},{"_id":"62596f9e1c0a084224b93e00","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/X2aLkJ0ofhkXwAg7lXvxD.jpeg","isPro":false,"fullname":"Guilherme Penedo","user":"guipenedo","type":"user"},{"_id":"626ede24d2fa9e7d598c8709","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/626ede24d2fa9e7d598c8709/JKS8-Y2Jw87EgNQZBRswq.jpeg","isPro":true,"fullname":"Hynek Kydlicek","user":"hynky","type":"user"},{"_id":"651e96991b97c9f33d26bde6","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/651e96991b97c9f33d26bde6/-Bqs6qrmz0yCfwtB2e-6q.jpeg","isPro":false,"fullname":"Elie Bakouch","user":"eliebak","type":"user"},{"_id":"660ed80b1889bf2cd53cab7f","avatarUrl":"/avatars/93ee6ff00668c2698ad8b6fa6f072b92.svg","isPro":false,"fullname":"Haojun Zhao","user":"zzhhjjj","type":"user"},{"_id":"6632d7e22c4f4bfc3f6a05c2","avatarUrl":"/avatars/8de694cf8680c548dd8301615437aacd.svg","isPro":false,"fullname":"Mohamed Mekkouri","user":"medmekk","type":"user"}],"collections":[],"datasets":[{"author":"nanotron","downloads":131,"gated":false,"id":"nanotron/ultrascale-playbook-data","lastModified":"2025-03-12T17:28:45.000Z","private":false,"repoType":"dataset","likes":3,"isLikedByUser":false},{"author":"nanotron","downloads":63,"gated":false,"id":"nanotron/minipile_100_samples","lastModified":"2024-07-10T04:02:49.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":100,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["text"]},"private":false,"repoType":"dataset","likes":1,"isLikedByUser":false},{"author":"nanotron","downloads":113,"gated":false,"id":"nanotron/llama3-1024-passkey-retrieval-eval","lastModified":"2024-07-04T13:18:59.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":12600,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false},{"author":"nanotron","downloads":97,"gated":false,"id":"nanotron/llama3-16k-passkey-retrieval-finetuning","lastModified":"2024-06-20T10:09:27.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":77250,"libraries":["datasets","dask","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false},{"author":"nanotron","downloads":71,"gated":false,"id":"nanotron/llama3-16k-passkey-retrieval-eval","lastModified":"2024-06-19T09:45:12.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":712,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false},{"author":"nanotron","downloads":72,"gated":false,"id":"nanotron/llama3_needle_16k_finetuning","lastModified":"2024-06-15T12:37:13.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":3570,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false},{"author":"nanotron","downloads":68,"gated":false,"id":"nanotron/needle_32k_eval_dataset","lastModified":"2024-05-29T12:44:17.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":1785,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":1,"isLikedByUser":false},{"author":"nanotron","downloads":127,"gated":false,"id":"nanotron/needle_32k_finetuning_dataset","lastModified":"2024-05-16T06:01:30.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":35500,"libraries":["datasets","dask","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false},{"author":"nanotron","downloads":73,"gated":false,"id":"nanotron/needle_in_a_hay_stack_finetuning_dataset","lastModified":"2024-05-14T11:26:32.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":21,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false},{"author":"nanotron","downloads":82,"gated":false,"id":"nanotron/needle_in_a_hay_stack_eval_dataset","lastModified":"2024-05-14T08:05:47.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":1,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":1,"isLikedByUser":false},{"author":"nanotron","downloads":429,"gated":false,"id":"nanotron/needle_in_a_hay_stack_finetune_dataset","lastModified":"2024-05-03T06:44:01.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":12600,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":1,"isLikedByUser":false},{"author":"nanotron","downloads":136,"gated":false,"id":"nanotron/simple_needle_in_a_hay_stack","lastModified":"2024-04-24T03:24:14.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":12600,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":1,"isLikedByUser":false},{"author":"nanotron","downloads":0,"gated":false,"id":"nanotron/the-pile-for-doremi","lastModified":"2024-02-20T06:26:25.000Z","private":false,"repoType":"dataset","likes":0,"isLikedByUser":false}],"models":[{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/temp_for_pr_review","availableInferenceProviders":[],"lastModified":"2024-09-24T19:40:59.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/fp8_for_nanotron","availableInferenceProviders":[],"lastModified":"2024-09-21T13:44:56.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":6,"gated":false,"id":"nanotron/llama3-8b-infini-attention","availableInferenceProviders":[],"lastModified":"2024-08-05T14:15:16.000Z","likes":3,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/bench_cluster_epfl","availableInferenceProviders":[],"lastModified":"2024-07-12T21:09:05.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/bench_cluster","availableInferenceProviders":[],"lastModified":"2024-07-06T14:20:45.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/test","availableInferenceProviders":[],"lastModified":"2024-07-06T13:15:19.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/old_bench","availableInferenceProviders":[],"lastModified":"2024-07-06T10:20:32.000Z","likes":3,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/minicpm-nanotron","availableInferenceProviders":[],"lastModified":"2024-04-11T15:12:12.000Z","likes":6,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/doremi-llama-2.5b-optimized-weights","availableInferenceProviders":[],"lastModified":"2024-02-22T13:34:15.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/doremi-llama-2.5b-reference","availableInferenceProviders":[],"lastModified":"2024-02-22T13:18:52.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/doremi-llama-280m-proxy","availableInferenceProviders":[],"lastModified":"2024-02-22T13:01:29.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/doremi-llama-280m-reference","availableInferenceProviders":[],"lastModified":"2024-02-19T13:06:25.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/mixtral-nanotron","availableInferenceProviders":[],"lastModified":"2024-02-17T16:34:25.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"downloads":0,"gated":false,"id":"nanotron/mistral-nanotron","availableInferenceProviders":[],"lastModified":"2024-02-09T22:28:08.000Z","likes":1,"private":false,"repoType":"model","isLikedByUser":false}],"spaces":[{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"colorFrom":"indigo","colorTo":"blue","createdAt":"2025-01-21T14:25:12.000Z","emoji":"🧮","id":"nanotron/predict_memory","lastModified":"2025-03-12T15:00:59.000Z","likes":54,"pinned":true,"private":false,"repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"nanotron-predict-memory.hf.space","stage":"READY"}],"sha":"5a41adf67ba885da0a46129db20c0e5c6c34a700"},"title":"Predict Memory","isLikedByUser":false,"ai_short_description":"Calculate memory usage from model configurations"},{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"colorFrom":"yellow","colorTo":"purple","createdAt":"2024-06-18T17:12:28.000Z","emoji":"🌌","id":"nanotron/ultrascale-playbook","lastModified":"2025-03-11T15:53:47.000Z","likes":2236,"pinned":true,"private":false,"repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":null,"requested":null},"storage":null,"replicas":{"requested":1,"current":1}},"shortDescription":"The ultimate guide to training LLM on large GPU Clusters","title":"The Ultra-Scale Playbook","isLikedByUser":false}],"repoFilterModels":{"sortKey":"modified"},"repoFilterDatasets":{"sortKey":"modified"},"repoFilterSpaces":{"sortKey":"modified"},"lastOrgActivities":[{"time":"2025-03-12T17:28:48.280Z","user":"nouamanetazi","userAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1652134289581-5ff8c9f4b2035d9a81a859f7.jpeg","orgAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","type":"update","repoData":{"author":"nanotron","downloads":131,"gated":false,"id":"nanotron/ultrascale-playbook-data","lastModified":"2025-03-12T17:28:45.000Z","private":false,"repoType":"dataset","likes":3,"isLikedByUser":false},"repoId":"nanotron/ultrascale-playbook-data","repoType":"dataset","org":"nanotron"},{"time":"2025-03-12T15:01:02.237Z","user":"nouamanetazi","userAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1652134289581-5ff8c9f4b2035d9a81a859f7.jpeg","orgAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","type":"update","repoData":{"author":"nanotron","authorData":{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","fullname":"Nanotron Research","name":"nanotron","type":"org","isHf":false,"isMod":false,"isEnterprise":false,"followerCount":192},"colorFrom":"indigo","colorTo":"blue","createdAt":"2025-01-21T14:25:12.000Z","emoji":"🧮","id":"nanotron/predict_memory","lastModified":"2025-03-12T15:00:59.000Z","likes":54,"pinned":true,"private":false,"sdk":"gradio","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"nanotron-predict-memory.hf.space","stage":"READY"}],"sha":"5a41adf67ba885da0a46129db20c0e5c6c34a700"},"title":"Predict Memory","isLikedByUser":false,"ai_short_description":"Calculate memory usage from model configurations","trendingScore":6},"repoId":"nanotron/predict_memory","repoType":"space","org":"nanotron"},{"time":"2025-03-12T01:22:00.480Z","user":"nouamanetazi","userAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1652134289581-5ff8c9f4b2035d9a81a859f7.jpeg","org":"nanotron","orgAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5df7e9e5da6d0311fd3d53f9/qAagSltOINhPaSgZe7roz.png","type":"discussion","discussionData":{"num":86,"author":{"_id":"6169e6f39e1ad4ff8735710b","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1634330350838-noauth.jpeg","fullname":"Aleksa Gordic","name":"gordicaleksa","type":"user","isPro":true,"isHf":false,"isMod":false,"followerCount":36},"repo":{"name":"nanotron/ultrascale-playbook","type":"space"},"title":"Few Errors","status":"open","createdAt":"2025-02-23T07:47:46.000Z","isPullRequest":false,"numComments":4,"pinned":false,"repoOwner":{"name":"nanotron","isParticipating":true,"type":"org","isDiscussionAuthor":false}},"repoId":"nanotron/ultrascale-playbook","repoType":"space","eventId":"67d0e1b80d83acdd81ae2c6a"}],"acceptLanguages":["en","*"],"blogPosts":[]}">
AI & ML interests
Large scale distributed AI model training, model parallelisation, low-level GPU acceleration, make GPUs go brrrrr
the Ultrascale-Playbook, a comprehensive book covering all distributed/parallelisation and low-level techniques that can be used to efficiently train models at the largest scales.