| Version | EAPI | Keywords | Slot |
|---|---|---|---|
| 0.22.1 | 8 | ~amd64 | 0 |
# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.15.0
EAPI=8
# Build the Rust-backed Python extension with the maturin PEP 517 backend.
DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{10..13} )
# Compiled extension; build for a single Python implementation only.
DISTUTILS_EXT=1
DISTUTILS_SINGLE_IMPL=1
RUST_MIN_VER="1.82.0"
# Intentionally empty: crates are shipped via the pregenerated crate
# tarballs added to SRC_URI below, not fetched individually.
CRATES="
"
inherit cargo distutils-r1
DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
SRC_URI="
https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
-> ${P}.gh.tar.gz
${CARGO_CRATE_URIS}
"
# Skip the crate tarballs while pkgbump is regenerating them for a new
# version (PKGBUMPING is set to PVR during that process).
if [[ ${PKGBUMPING} != ${PVR} ]]; then
SRC_URI+="
https://dev.gentoo.org/~tupone/distfiles/${PN}-0.22.0-crates.tar.xz
https://dev.gentoo.org/~tupone/distfiles/${PN}-python-${PV}-crates.tar.xz
"
fi
LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"
# System oniguruma is used at runtime (see RUSTONIG_SYSTEM_LIBONIG below).
RDEPEND="dev-libs/oniguruma"
BDEPEND="
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')
"
distutils_enable_tests pytest
# The installed .so is built by cargo; skip the usual flag checks on it.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack the GitHub tarball plus the vendored crate tarballs via cargo.eclass.
src_unpack() {
cargo_src_unpack
}
# Select the Python implementation and verify a Rust >= RUST_MIN_VER toolchain.
pkg_setup() {
python-single-r1_pkg_setup
rust_pkg_setup
}
# Apply Gentoo patches, then run the distutils prepare phase from the
# Python bindings subdirectory (the PEP 517 project root).
src_prepare() {
	default
	# || die: an unchecked cd would silently run the rest of the phase
	# in the wrong directory.
	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.21.2-test.patch
	distutils-r1_src_prepare
}
# Configure the Rust workspace first, then the Python bindings.
src_configure() {
	cd tokenizers || die
	cargo_src_configure
	cd ../bindings/python || die
	distutils-r1_src_configure
}
# Build the Rust crate, then the Python extension.
src_compile() {
	# Link the onig crate against the system dev-libs/oniguruma instead
	# of building the bundled copy.
	export RUSTONIG_SYSTEM_LIBONIG=1
	cd tokenizers || die
	cargo_src_compile
	cd ../bindings/python || die
	distutils-r1_src_compile
}
# Run the Python binding tests; the Rust test suite is skipped (see below).
src_test() {
	cd tokenizers || die
	# Tests do not work
	#cargo_src_test
	cd ../bindings/python || die
	local -x EPYTEST_IGNORE=( benches/ )
	# Deselected tests require network access or fixture data not
	# available in the sandbox.
	local -x EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)
	distutils-r1_src_test
}
# Only the Python bindings install anything; the Rust crate itself is not
# installed. (The original did a pointless `cd tokenizers` immediately
# undone by `cd ../bindings/python`; collapsed to a single checked cd.)
src_install() {
	cd bindings/python || die
	distutils-r1_src_install
}
# NOTE(review): the following lines duplicate the RDEPEND/BDEPEND entries
# declared above and were stray top-level text (sourcing the ebuild would
# fail on them); kept here as comments so no content is lost.
# dev-libs/oniguruma
# test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
# $(python_gen_cond_dep '
# dev-python/setuptools-rust[${PYTHON_USEDEP}]
# ')
| Type | File | Size | Source URLs |
|---|---|---|---|
| DIST | tokenizers-0.22.1.gh.tar.gz | 1563505 bytes | https://github.com/huggingface/tokenizers/archive/refs/tags/v0.22.1.tar.gz |