| Version | EAPI | Keywords | Slot |
|---|---|---|---|
| 0.21.1-r1 | 8 | ~amd64 | 0 |
# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.13.3
# Ebuild API level.
EAPI=8

# Python / distutils-r1 settings, assigned ahead of the `inherit` line.
PYTHON_COMPAT=( python3_{10..13} )
DISTUTILS_USE_PEP517=maturin
DISTUTILS_SINGLE_IMPL=1
DISTUTILS_EXT=1
CRATES="
addr2line@0.24.2
adler2@2.0.0
aho-corasick@1.1.3
anes@0.1.4
anstream@0.6.18
anstyle@1.0.10
anstyle-parse@0.2.6
anstyle-query@1.1.2
anstyle-wincon@3.0.6
assert_approx_eq@1.1.0
autocfg@1.4.0
backtrace@0.3.74
base64@0.13.1
base64@0.21.7
base64@0.22.1
bitflags@1.3.2
bitflags@2.4.0
bitflags@2.6.0
bit-set@0.8.0
bit-vec@0.8.0
bumpalo@3.16.0
byteorder@1.5.0
bytes@1.10.1
cast@0.3.0
cc@1.2.6
cc@1.2.8
cfg_aliases@0.2.1
cfg-if@1.0.0
ciborium@0.2.2
ciborium-io@0.2.2
ciborium-ll@0.2.2
clap@4.5.35
clap_builder@4.5.35
clap_lex@0.7.4
colorchoice@1.0.3
console@0.15.10
crc32fast@1.4.2
criterion@0.5.1
criterion-plot@0.5.0
crossbeam-deque@0.8.6
crossbeam-epoch@0.9.18
crossbeam-utils@0.8.21
crunchy@0.2.3
darling@0.20.10
darling_core@0.20.10
darling_macro@0.20.10
derive_builder@0.20.2
derive_builder_core@0.20.2
derive_builder_macro@0.20.2
dirs@5.0.1
dirs-sys@0.4.1
displaydoc@0.2.5
either@1.13.0
encode_unicode@1.0.0
env_filter@0.1.3
env_logger@0.11.6
errno@0.3.10
esaxx-rs@0.1.10
fancy-regex@0.14.0
fastrand@2.3.0
flate2@1.1.1
fnv@1.0.7
form_urlencoded@1.2.1
futures-channel@0.3.31
futures-core@0.3.31
futures-io@0.3.31
futures-macro@0.3.31
futures-sink@0.3.31
futures-task@0.3.31
futures-util@0.3.31
getrandom@0.2.15
getrandom@0.3.0
gimli@0.31.1
half@2.5.0
heck@0.5.0
hermit-abi@0.5.0
hf-hub@0.4.2
http@1.3.1
http-body@1.0.1
http-body-util@0.1.3
httparse@1.10.1
humantime@2.1.0
hyper@1.6.0
hyper-rustls@0.27.5
hyper-util@0.1.11
icu_collections@1.5.0
icu_locid@1.5.0
icu_locid_transform@1.5.0
icu_locid_transform_data@1.5.1
icu_normalizer@1.5.0
icu_normalizer_data@1.5.1
icu_properties@1.5.1
icu_properties_data@1.5.1
icu_provider@1.5.0
icu_provider_macros@1.5.0
ident_case@1.0.1
idna@1.0.3
idna_adapter@1.2.0
indicatif@0.17.9
indoc@2.0.5
ipnet@2.11.0
is-terminal@0.4.16
is_terminal_polyfill@1.70.1
itertools@0.10.5
itertools@0.11.0
itertools@0.12.1
itertools@0.13.0
itoa@1.0.14
js-sys@0.3.76
js-sys@0.3.77
lazy_static@1.5.0
libc@0.2.169
libc@0.2.171
libredox@0.1.3
linux-raw-sys@0.4.14
litemap@0.7.5
log@0.4.22
macro_rules_attribute@0.2.0
macro_rules_attribute-proc_macro@0.2.0
matrixmultiply@0.3.9
memchr@2.7.4
memoffset@0.9.1
mime@0.3.17
minimal-lexical@0.2.1
miniz_oxide@0.8.7
mio@1.0.3
monostate@0.1.13
monostate-impl@0.1.13
ndarray@0.16.1
nom@7.1.3
nu-ansi-term@0.46.0
number_prefix@0.4.0
num-complex@0.4.6
num-integer@0.1.46
numpy@0.23.0
num-traits@0.2.19
object@0.36.7
once_cell@1.20.2
onig@6.4.0
onig_sys@69.8.1
oorandom@11.1.5
option-ext@0.2.0
overload@0.1.1
paste@1.0.15
percent-encoding@2.3.1
pin-project-lite@0.2.16
pin-utils@0.1.0
pkg-config@0.3.31
plotters@0.3.7
plotters-backend@0.3.7
plotters-svg@0.3.7
portable-atomic@1.10.0
portable-atomic-util@0.2.4
ppv-lite86@0.2.20
proc-macro2@1.0.92
pyo3@0.23.5
pyo3-build-config@0.23.5
pyo3-ffi@0.23.5
pyo3-macros@0.23.5
pyo3-macros-backend@0.23.5
quinn@0.11.7
quinn-proto@0.11.10
quinn-udp@0.5.11
quote@1.0.38
rand@0.8.5
rand@0.9.0
rand_chacha@0.3.1
rand_chacha@0.9.0
rand_core@0.6.4
rand_core@0.9.0
rawpointer@0.2.1
rayon@1.10.0
rayon-cond@0.3.0
rayon-core@1.12.1
redox_users@0.4.6
regex@1.11.1
regex-automata@0.4.9
regex-syntax@0.8.5
reqwest@0.12.15
ring@0.17.14
rustc-demangle@0.1.24
rustc-hash@2.1.0
rustix@0.38.42
rustls@0.21.12
rustls@0.23.25
rustls-pemfile@2.2.0
rustls-pki-types@1.11.0
rustls-webpki@0.101.7
rustls-webpki@0.103.1
rustversion@1.0.20
ryu@1.0.18
same-file@1.0.6
sct@0.7.1
serde@1.0.217
serde_derive@1.0.217
serde_json@1.0.134
serde_urlencoded@0.7.1
sharded-slab@0.1.7
shlex@1.3.0
slab@0.4.9
smallvec@1.13.2
socket2@0.5.9
socks@0.3.4
spm_precompiled@0.1.4
stable_deref_trait@1.2.0
strsim@0.11.1
subtle@2.6.1
syn@2.0.93
synstructure@0.13.1
sync_wrapper@1.0.2
target-lexicon@0.12.16
tempfile@3.14.0
thiserror@1.0.69
thiserror@2.0.9
thiserror-impl@1.0.69
thiserror-impl@2.0.9
thread_local@1.1.8
tinytemplate@1.2.1
tinyvec@1.9.0
tinyvec_macros@0.1.1
tokio@1.44.1
tokio-rustls@0.26.2
tokio-util@0.7.14
tower@0.5.2
tower-layer@0.3.3
tower-service@0.3.3
tracing@0.1.41
tracing-attributes@0.1.28
tracing-core@0.1.33
tracing-log@0.2.0
tracing-subscriber@0.3.19
try-lock@0.2.5
tinystr@0.7.5
unicode_categories@0.1.1
unicode-ident@1.0.14
unicode-normalization-alignments@0.1.12
unicode-segmentation@1.12.0
unicode-width@0.2.0
unindent@0.2.3
untrusted@0.9.0
ureq@2.8.0
url@2.5.4
utf16_iter@1.0.5
utf8_iter@1.0.4
utf8parse@0.2.2
valuable@0.1.1
walkdir@2.5.0
want@0.3.1
wasi@0.11.0+wasi-snapshot-preview1
wasi@0.13.3+wasi-0.2.2
wasm-bindgen@0.2.99
wasm-bindgen@0.2.100
wasm-bindgen-backend@0.2.99
wasm-bindgen-backend@0.2.100
wasm-bindgen-futures@0.4.50
wasm-bindgen-macro@0.2.99
wasm-bindgen-macro@0.2.100
wasm-bindgen-macro-support@0.2.99
wasm-bindgen-macro-support@0.2.100
wasm-bindgen-shared@0.2.99
wasm-bindgen-shared@0.2.100
wasm-streams@0.4.2
webpki-roots@0.25.4
webpki-roots@0.26.8
web-sys@0.3.77
web-time@1.1.0
winapi@0.3.9
winapi-i686-pc-windows-gnu@0.4.0
winapi-util@0.1.9
winapi-x86_64-pc-windows-gnu@0.4.0
windows_aarch64_gnullvm@0.48.0
windows_aarch64_gnullvm@0.52.6
windows_aarch64_gnullvm@0.53.0
windows_aarch64_msvc@0.48.0
windows_aarch64_msvc@0.52.6
windows_aarch64_msvc@0.53.0
windows_i686_gnu@0.48.0
windows_i686_gnu@0.52.6
windows_i686_gnu@0.53.0
windows_i686_gnullvm@0.52.6
windows_i686_gnullvm@0.53.0
windows_i686_msvc@0.48.0
windows_i686_msvc@0.52.6
windows_i686_msvc@0.53.0
windows-link@0.1.1
windows-registry@0.4.0
windows-result@0.3.2
windows-strings@0.3.0
windows-sys@0.48.0
windows-sys@0.52.0
windows-sys@0.59.0
windows-targets@0.48.0
windows-targets@0.52.6
windows-targets@0.53.0
windows_x86_64_gnu@0.48.0
windows_x86_64_gnu@0.52.6
windows_x86_64_gnu@0.53.0
windows_x86_64_gnullvm@0.48.0
windows_x86_64_gnullvm@0.52.6
windows_x86_64_gnullvm@0.53.0
windows_x86_64_msvc@0.48.0
windows_x86_64_msvc@0.52.6
windows_x86_64_msvc@0.53.0
wit-bindgen-rt@0.33.0
write16@1.0.0
writeable@0.5.5
yoke@0.7.5
yoke-derive@0.7.5
zerocopy@0.7.35
zerocopy@0.8.24
zerocopy-derive@0.7.35
zerocopy-derive@0.8.24
zerofrom@0.1.6
zerofrom-derive@0.1.6
zeroize@1.8.1
zerovec@0.10.2
zerovec-derive@0.10.2
"
inherit cargo distutils-r1
DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
# Upstream release tarball (renamed to a stable local name) plus every crate
# listed in CRATES.
SRC_URI="
	https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
		-> ${P}.gh.tar.gz
	${CARGO_CRATE_URIS}
"

LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
	Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
	Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"

RDEPEND="dev-libs/oniguruma"
# src_compile sets RUSTONIG_SYSTEM_LIBONIG=1, i.e. the build uses the system
# oniguruma, so the library must already be installed at build time and not
# only at run time.
DEPEND="${RDEPEND}"
BDEPEND="
	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')
"

# Must follow the BDEPEND assignment above.
distutils_enable_tests pytest

# The Python extension module is built by cargo, so skip the flags QA check
# for the installed shared objects.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack phase: hand everything over to the cargo eclass implementation.
src_unpack() {
	cargo_src_unpack
}
# Setup phase: run the python-single-r1 setup hook first, then the rust one,
# so both toolchains are initialised.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}
# Prepare phase: run the default preparation in the top-level source tree,
# then patch and prepare the Python bindings from within bindings/python.
src_prepare() {
	default

	# Abort instead of patching the wrong directory if the layout changed.
	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.15.2-test.patch
	distutils-r1_src_prepare
}
# Configure phase: configure the Rust crate in tokenizers/ first, then the
# Python bindings in bindings/python. Every directory change is checked so a
# changed source layout fails loudly instead of configuring the wrong tree.
src_configure() {
	cd tokenizers || die
	cargo_src_configure

	cd ../bindings/python || die
	distutils-r1_src_configure
}
# Compile phase: build the Rust crate in tokenizers/, then the Python bindings
# in bindings/python. Every directory change is checked so a changed source
# layout fails loudly instead of building in the wrong tree.
src_compile() {
	# Exported for the whole build so the onig crate is told to use the
	# system oniguruma library.
	export RUSTONIG_SYSTEM_LIBONIG=1

	cd tokenizers || die
	cargo_src_compile

	cd ../bindings/python || die
	distutils-r1_src_compile
}
# Test phase: run only the Python test suite of the bindings.
#
# The Rust-side tests (cargo_src_test in tokenizers/) do not work and stay
# disabled, so there is no need to enter tokenizers/ at all; go straight to
# bindings/python and abort if that directory is missing.
src_test() {
	# Tests do not work
	#cargo_src_test

	cd bindings/python || die

	# Plain locals: bash cannot export arrays, so `-x` had no effect here.
	local EPYTEST_IGNORE=( benches/ )
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)

	distutils-r1_src_test
}
# Install phase: only the Python bindings are installed, so enter
# bindings/python directly (no detour through tokenizers/) and abort if the
# directory is missing rather than installing from the wrong tree.
src_install() {
	cd bindings/python || die
	distutils-r1_src_install
}
dev-libs/oniguruma
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')