| Version | EAPI | Keywords | Slot |
|---|---|---|---|
| 0.21.1-r88881 | 8 | ~amd64 | 0 |
# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.13.3
EAPI=8

# Python build settings; these are assigned before the `inherit` line below.
PYTHON_COMPAT=( python3_{10..13} )
DISTUTILS_USE_PEP517=maturin
DISTUTILS_SINGLE_IMPL=1
DISTUTILS_EXT=1
# Rust crate dependencies as name@version, one per line (generated by
# pycargoebuild, see the header). The matching download URIs are pulled into
# SRC_URI through ${CARGO_CRATE_URIS}.
# NOTE: src_prepare overwrites the Cargo.toml of console, getrandom, ndarray
# and zerocopy, addressing them by the exact versions listed here - keep both
# places in sync when bumping.
CRATES="
aho-corasick@1.1.3
autocfg@1.4.0
base64@0.13.1
bit-set@0.8.0
bit-vec@0.8.0
bitflags@1.3.2
cc@1.2.24
cfg-if@1.0.0
console@0.15.11
crossbeam-deque@0.8.6
crossbeam-epoch@0.9.18
crossbeam-utils@0.8.21
darling@0.20.11
darling_core@0.20.11
darling_macro@0.20.11
derive_builder@0.20.2
derive_builder_core@0.20.2
derive_builder_macro@0.20.2
either@1.15.0
esaxx-rs@0.1.10
fancy-regex@0.14.0
fnv@1.0.7
getrandom@0.2.16
heck@0.5.0
ident_case@1.0.1
indicatif@0.17.11
indoc@2.0.6
itertools@0.11.0
itertools@0.12.1
itertools@0.13.0
itoa@1.0.15
lazy_static@1.5.0
libc@0.2.172
log@0.4.27
macro_rules_attribute-proc_macro@0.2.0
macro_rules_attribute@0.2.0
matrixmultiply@0.3.10
memchr@2.7.4
memoffset@0.9.1
minimal-lexical@0.2.1
monostate-impl@0.1.14
monostate@0.1.14
ndarray@0.16.1
nom@7.1.3
num-complex@0.4.6
num-integer@0.1.46
num-traits@0.2.19
number_prefix@0.4.0
numpy@0.25.0
once_cell@1.21.3
onig@6.4.0
onig_sys@69.8.1
paste@1.0.15
pkg-config@0.3.32
portable-atomic@1.11.0
ppv-lite86@0.2.21
proc-macro2@1.0.95
pyo3-build-config@0.25.0
pyo3-ffi@0.25.0
pyo3-macros-backend@0.25.0
pyo3-macros@0.25.0
pyo3@0.25.0
quote@1.0.40
rand@0.8.5
rand_chacha@0.3.1
rand_core@0.6.4
rawpointer@0.2.1
rayon-cond@0.3.0
rayon-core@1.12.1
rayon@1.10.0
regex-automata@0.4.9
regex-syntax@0.8.5
regex@1.11.1
rustc-hash@2.1.1
ryu@1.0.20
serde@1.0.219
serde_derive@1.0.219
serde_json@1.0.140
shlex@1.3.0
smallvec@1.15.0
spm_precompiled@0.1.4
strsim@0.11.1
syn@2.0.101
target-lexicon@0.13.2
thiserror-impl@2.0.12
thiserror@2.0.12
unicode-ident@1.0.18
unicode-normalization-alignments@0.1.12
unicode-segmentation@1.12.0
unicode-width@0.2.0
unicode_categories@0.1.1
unindent@0.2.4
zerocopy@0.8.25
"
inherit cargo distutils-r1

DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
SRC_URI="
	https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
		-> ${P}.gh.tar.gz
	${CARGO_CRATE_URIS}
"
# Build from the Python bindings subdirectory of the upstream tarball.
S="${WORKDIR}"/${P}/bindings/python

LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
	Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
	Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"

RDEPEND="
	dev-libs/oniguruma
"
# src_compile exports RUSTONIG_SYSTEM_LIBONIG=1, i.e. the build is pointed at
# the system oniguruma, so the library must be installed at build time as well
# as at run time.
DEPEND="${RDEPEND}"
BDEPEND="
	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')
"
# NOTE(review): dev-python/numpy[${PYTHON_USEDEP}] used to be listed (wrapped
# in python_gen_cond_dep) but is disabled; the reason is not recorded here -
# confirm before re-adding it to RDEPEND or BDEPEND.

distutils_enable_tests pytest

# Skip the "flags ignored" QA check for the installed extension module(s).
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack phase: delegate entirely to the cargo eclass helper.
src_unpack() {
	cargo_src_unpack
}
# Setup phase: run the Python (python-single-r1) setup helper first, then the
# Rust one, so both toolchains are prepared.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}
# Prepare phase: run the distutils-r1 preparation, apply the local patches and
# replace several Cargo.toml manifests with the copies shipped in FILESDIR.
#
# Every copy of a required manifest is fatal on failure (|| die); previously a
# failed cp was silently ignored and the build carried on with the unmodified
# manifest.
src_prepare() {
	distutils-r1_src_prepare

	eapply "${FILESDIR}/${PN}-0.15.2-test.patch"

	local src_root="${WORKDIR}/${PN}-${PV}"
	local vendor_dir="${WORKDIR}/cargo_home/gentoo"
	local crate

	# Replace the manifests of the core crate and of the Python bindings.
	cp "${FILESDIR}/Cargo-${PVR}.toml" \
		"${src_root}/${PN}/Cargo.toml" || die
	cp "${FILESDIR}/Cargo-${PN}-python-${PVR}.toml" \
		"${src_root}/bindings/python/Cargo.toml" || die

	# Replace the manifests of vendored crates that are listed in CRATES.
	# ${crate%-*} strips the trailing "-<version>" to get the crate name.
	for crate in console-0.15.11 zerocopy-0.8.25 getrandom-0.2.16; do
		cp "${FILESDIR}/Cargo-${crate%-*}-${PVR}.toml" \
			"${vendor_dir}/${crate}/Cargo.toml" || die
	done

	# anstyle-query and jiff are not listed in CRATES, so their vendored
	# directories are normally absent. Only override them when they exist
	# instead of letting cp fail on a missing target directory.
	for crate in anstyle-query-1.1.2 jiff-0.2.14; do
		[[ -d ${vendor_dir}/${crate} ]] || continue
		cp "${FILESDIR}/Cargo-${crate%-*}-${PVR}.toml" \
			"${vendor_dir}/${crate}/Cargo.toml" || die
	done

	eapply "${FILESDIR}/env_logger_disable-${PVR}.patch"

	cp "${FILESDIR}/Cargo-ndarray-${PVR}.toml" \
		"${vendor_dir}/ndarray-0.16.1/Cargo.toml" || die
}
# Configure phase: run the cargo configuration helper first, then the
# distutils-r1 one.
src_configure() {
	cargo_src_configure
	distutils-r1_src_configure
}
# Per-implementation compile step: run the cargo build helper, then the
# regular distutils-r1 Python compile step.
python_compile() {
	cargo_src_compile
	distutils-r1_python_compile
}
# Compile phase: export the oniguruma switch, then hand over to distutils-r1.
src_compile() {
	# Presumably makes the onig_sys crate use the system libonig
	# (dev-libs/oniguruma is in RDEPEND) - confirm against onig_sys docs.
	export RUSTONIG_SYSTEM_LIBONIG=1

	distutils-r1_src_compile
}
# Test phase: run the Python test suite through distutils-r1 with the
# benchmark directory ignored and a fixed set of tests deselected.
# The Rust-side tests (cargo_src_test) are not run; the original note here
# says they do not work.
#
# The arrays are plain locals: bash cannot export arrays, so `local -x` added
# nothing. The helpers called from here still see them because bash locals
# are visible to callees.
src_test() {
	local EPYTEST_IGNORE=( benches/ )
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)

	distutils-r1_src_test
}
# Install phase: delegate entirely to distutils-r1; nothing is installed from
# the Rust side separately.
src_install() {
	distutils-r1_src_install
}
dev-libs/oniguruma
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')