Install this version:
emerge -a =sci-ml/tokenizers-0.23.1
If this version is masked, you can unmask it using the autounmask tool or standard emerge options:
autounmask =sci-ml/tokenizers-0.23.1
Or alternatively:
emerge --autounmask-write -a =sci-ml/tokenizers-0.23.1
| Version | EAPI | Keywords | Slot |
|---|---|---|---|
| 0.23.1 | 8 | ~amd64 | 0 |
# Copyright 2023-2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.15.0
# EAPI level this ebuild is written for.
EAPI=8
# Settings consumed by the eclasses inherited below; they are assigned
# before the `inherit` line.
# Python build backend: maturin.
DISTUTILS_USE_PEP517=maturin
# Supported Python targets: python3_12 through python3_14 (brace expansion).
PYTHON_COMPAT=( python3_{12..14} )
# NOTE(review): presumably declares that the package builds a native
# extension module -- confirm against the distutils-r1 documentation.
DISTUTILS_EXT=1
# Single-implementation mode; pkg_setup below calls
# python-single-r1_pkg_setup and the dependencies use
# PYTHON_SINGLE_USEDEP accordingly.
DISTUTILS_SINGLE_IMPL=1
# Minimum Rust version requested for the build.
RUST_MIN_VER="1.87.0"
# No individual crates are listed here; crate tarballs are appended to
# SRC_URI further down instead.
CRATES="
"
inherit cargo distutils-r1
DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
# Upstream tag tarball from GitHub, saved locally as ${P}.gh.tar.gz, plus
# whatever crate URIs ${CARGO_CRATE_URIS} expands to (CRATES is empty in
# this ebuild).
SRC_URI="
https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
-> ${P}.gh.tar.gz
${CARGO_CRATE_URIS}
"
# Add the pre-packed crate tarballs (one for the Rust library, one for the
# Python bindings) unless PKGBUMPING equals ${PVR}.
# NOTE(review): PKGBUMPING is not set anywhere in this file; presumably it
# is exported by the maintainer's version-bump tooling so the tarballs are
# not required while they are being generated -- confirm.
if [[ ${PKGBUMPING} != ${PVR} ]]; then
SRC_URI+="
https://dev.gentoo.org/~tupone/distfiles/${P}-crates.tar.xz
https://dev.gentoo.org/~tupone/distfiles/${PN}-python-${PV}-crates.tar.xz
"
fi
# License of the package itself.
LICENSE="Apache-2.0"
# Dependent crate licenses
# (Apache-2.0 appears again in this list; it is appended as-is.)
LICENSE+="
Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
Unicode-DFS-2016
"
SLOT="0"
# Only the testing (~) amd64 keyword is set.
KEYWORDS="~amd64"
# Runtime dependency on the oniguruma library; src_compile exports
# RUSTONIG_SYSTEM_LIBONIG=1, which presumably makes the build link against
# this system copy rather than a bundled one.
RDEPEND="dev-libs/oniguruma"
# Build-time dependencies: sci-ml/datasets only when USE=test is enabled,
# and setuptools-rust generated per Python target via python_gen_cond_dep.
BDEPEND="
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')
"
# Empty plugin list: no pytest plugins are named for the test run.
EPYTEST_PLUGINS=( )
# Run the test suite with pytest.
distutils_enable_tests pytest
# Regular expression matching the shared objects installed under
# site-packages/tokenizers/; those files are listed in QA_FLAGS_IGNORED.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack phase: hand the whole job to the cargo eclass unpacker.
src_unpack() {
	cargo_src_unpack
}
# Setup phase: run the Python single-implementation setup first, then the
# Rust toolchain setup. Both are called explicitly, in this order.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}
# Prepare phase: run the default preparation at the top of the source
# tree, then patch and prepare the Python bindings from their own
# subdirectory.
src_prepare() {
	default

	# The patch and the distutils-r1 preparation both run from
	# bindings/python. Abort if the directory change fails so they are
	# never run from the wrong location.
	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.21.2-test.patch
	distutils-r1_src_prepare
}
# Configure phase: configure the Rust crate in tokenizers/, then the
# Python bindings in bindings/python.
src_configure() {
	# cd is a shell builtin and does not abort the build on failure by
	# itself, so each directory change is checked explicitly.
	cd tokenizers || die
	cargo_src_configure

	cd ../bindings/python || die
	distutils-r1_src_configure
}
# Compile phase: build the Rust crate in tokenizers/, then the Python
# bindings in bindings/python.
src_compile() {
	# Exported so that it is visible to both build steps below.
	export RUSTONIG_SYSTEM_LIBONIG=1

	# cd is a shell builtin and does not abort the build on failure by
	# itself, so each directory change is checked explicitly.
	cd tokenizers || die
	cargo_src_compile

	cd ../bindings/python || die
	distutils-r1_src_compile
}
# Test phase: run the Python binding tests from bindings/python, skipping
# one benchmark file and a list of individual deselected tests.
src_test() {
	# The Rust crate tests (cargo_src_test, run from tokenizers/) do not
	# work and are therefore not executed; only the Python tests run.
	cd bindings/python || die

	# Plain local arrays: bash cannot export arrays, so no -x attribute.
	local EPYTEST_IGNORE=( benches/test_tiktoken.py )
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_async_methods_existence
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_basic_encoding
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_concurrency
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_decode
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_encode
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_error_handling
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_large_batch
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_numpy_inputs
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_various_input_formats
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_performance_comparison
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_special_tokens
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_truncation_padding
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_skip_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_stream_fallback
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
	)

	distutils-r1_src_test
}
# Install phase: install the Python bindings from bindings/python.
src_install() {
	# Nothing is run from tokenizers/ in this phase, so change straight
	# into the bindings directory and abort if that fails.
	cd bindings/python || die
	distutils-r1_src_install
}
dev-libs/oniguruma
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] ) $(python_gen_cond_dep ' dev-python/setuptools-rust[${PYTHON_USEDEP}] ')
| Type | File | Size | Source URLs |
|---|---|---|---|
| DIST | tokenizers-0.23.1.gh.tar.gz | 1613786 bytes | https://github.com/huggingface/tokenizers/archive/refs/tags/v0.23.1.tar.gz |