| Version | EAPI | Keywords | Slot |
|---|---|---|---|
| 0.21.0 | 8 | ~amd64 | 0 |
# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.13.3
EAPI=8
# Python build configuration. These assignments sit ahead of the `inherit`
# line further down in this file.
# PEP 517 build backend selected for the Python bindings.
DISTUTILS_USE_PEP517=maturin
# Python implementations this ebuild accepts: 3.10 through 3.13.
PYTHON_COMPAT=( python3_{10..13} )
# NOTE(review): presumably flags the package as building a native extension
# module -- confirm against the distutils-r1 eclass documentation.
DISTUTILS_EXT=1
# Single-implementation mode: pkg_setup below calls python-single-r1_pkg_setup
# and the dependencies use PYTHON_SINGLE_USEDEP / python_gen_cond_dep.
DISTUTILS_SINGLE_IMPL=1
CRATES="
adler2@2.0.0
aho-corasick@1.1.3
anes@0.1.6
anstream@0.6.18
anstyle@1.0.10
anstyle-parse@0.2.6
anstyle-query@1.1.2
anstyle-wincon@3.0.6
assert_approx_eq@1.1.0
autocfg@1.4.0
base64@0.13.1
base64@0.22.1
bit-set@0.5.3
bit-vec@0.6.3
bitflags@1.3.2
bitflags@2.6.0
bitflags@2.7.0
bumpalo@3.16.0
byteorder@1.5.0
cast@0.3.0
cc@1.2.1
cc@1.2.8
cfg-if@1.0.0
ciborium-io@0.2.2
ciborium-ll@0.2.2
ciborium@0.2.2
clap@4.5.26
clap_builder@4.5.26
clap_lex@0.7.4
colorchoice@1.0.3
console@0.15.8
console@0.15.10
core-foundation-sys@0.8.7
core-foundation@0.9.4
crc32fast@1.4.2
criterion-plot@0.5.0
criterion@0.5.1
crossbeam-deque@0.8.5
crossbeam-deque@0.8.6
crossbeam-epoch@0.9.18
crossbeam-utils@0.8.20
crossbeam-utils@0.8.21
crunchy@0.2.2
darling@0.20.10
darling_core@0.20.10
darling_macro@0.20.10
derive_builder@0.20.2
derive_builder_core@0.20.2
derive_builder_macro@0.20.2
dirs-sys@0.4.1
dirs@5.0.1
displaydoc@0.2.5
either@1.13.0
encode_unicode@0.3.6
encode_unicode@1.0.0
env_filter@0.1.2
env_logger@0.11.5
erased-serde@0.4.5
errno@0.3.9
errno@0.3.10
esaxx-rs@0.1.10
fancy-regex@0.13.0
fastrand@2.2.0
fastrand@2.3.0
flate2@1.0.35
fnv@1.0.7
foreign-types-shared@0.1.1
foreign-types@0.3.2
form_urlencoded@1.2.1
getrandom@0.2.15
half@2.4.1
heck@0.5.0
hermit-abi@0.4.0
hf-hub@0.3.2
humantime@2.1.0
icu_collections@1.5.0
icu_locid@1.5.0
icu_locid_transform@1.5.0
icu_locid_transform_data@1.5.0
icu_normalizer@1.5.0
icu_normalizer_data@1.5.0
icu_properties@1.5.1
icu_properties_data@1.5.0
icu_provider@1.5.0
icu_provider_macros@1.5.0
ident_case@1.0.1
idna@1.0.3
idna_adapter@1.2.0
indicatif@0.17.9
indoc@2.0.5
is-terminal@0.4.13
itertools@0.10.5
itertools@0.11.0
itertools@0.12.1
itoa@1.0.14
js-sys@0.3.72
js-sys@0.3.76
is_terminal_polyfill@1.70.1
lazy_static@1.5.0
libc@0.2.166
libc@0.2.169
libredox@0.1.3
linux-raw-sys@0.4.14
linux-raw-sys@0.4.15
litemap@0.7.4
log@0.4.22
log@0.4.24
macro_rules_attribute-proc_macro@0.2.0
macro_rules_attribute@0.2.0
matrixmultiply@0.3.9
memchr@2.7.4
memoffset@0.9.1
minimal-lexical@0.2.1
miniz_oxide@0.8.2
monostate-impl@0.1.13
monostate@0.1.13
native-tls@0.2.12
ndarray@0.15.6
ndarray@0.16.1
nom@7.1.3
nu-ansi-term@0.46.0
numpy@0.22.1
num-complex@0.4.6
num-integer@0.1.46
num-traits@0.2.19
number_prefix@0.4.0
once_cell@1.20.2
onig@6.4.0
onig_sys@69.8.1
oorandom@11.1.4
openssl-macros@0.1.1
openssl-probe@0.1.5
openssl-sys@0.9.104
openssl@0.10.68
option-ext@0.2.0
overload@0.1.1
paste@1.0.15
percent-encoding@2.3.1
pin-project-lite@0.2.16
pkg-config@0.3.31
plotters-backend@0.3.7
plotters-svg@0.3.7
plotters@0.3.7
portable-atomic-util@0.2.4
portable-atomic@1.10.0
ppv-lite86@0.2.20
proc-macro2@1.0.92
proc-macro2@1.0.93
pyo3-build-config@0.22.6
pyo3-ffi@0.22.6
pyo3-macros-backend@0.22.6
pyo3-macros@0.22.6
pyo3@0.22.6
quote@1.0.37
quote@1.0.38
rand@0.8.5
rand_chacha@0.3.1
rand_core@0.6.4
rawpointer@0.2.1
rayon-cond@0.3.0
rayon-core@1.12.1
rayon@1.10.0
redox_users@0.4.6
regex-automata@0.4.9
regex-syntax@0.8.5
regex@1.11.1
ring@0.17.8
rustc-hash@1.1.0
rustix@0.38.41
rustix@0.38.43
rustls-pki-types@1.10.1
rustls-webpki@0.102.8
rustls@0.23.21
ryu@1.0.18
same-file@1.0.6
schannel@0.1.27
security-framework-sys@2.14.0
security-framework@2.11.1
serde@1.0.215
serde@1.0.217
serde_derive@1.0.215
serde_derive@1.0.217
serde_fmt@1.0.3
serde_json@1.0.133
serde_json@1.0.135
sharded-slab@0.1.7
shlex@1.3.0
smallvec@1.13.2
spin@0.9.8
spm_precompiled@0.1.4
stable_deref_trait@1.2.0
strsim@0.11.1
subtle@2.6.1
sval@2.13.2
sval_buffer@2.13.2
sval_dynamic@2.13.2
sval_fmt@2.13.2
sval_json@2.13.2
sval_nested@2.13.2
sval_ref@2.13.2
sval_serde@2.13.2
syn@2.0.89
syn@2.0.96
synstructure@0.13.1
target-lexicon@0.12.16
tempfile@3.14.0
tempfile@3.15.0
thiserror-impl@1.0.69
thiserror@1.0.69
thread_local@1.1.8
tinystr@0.7.6
tinytemplate@1.2.1
tracing-attributes@0.1.28
tracing-core@0.1.33
tracing-log@0.2.0
tracing-subscriber@0.3.19
tracing@0.1.41
typeid@1.0.2
unicode-ident@1.0.14
unicode-normalization-alignments@0.1.12
unicode-segmentation@1.12.0
unicode-width@0.1.14
unicode-width@0.2.0
unicode_categories@0.1.1
unindent@0.2.3
untrusted@0.9.0
ureq@2.12.1
url@2.5.4
utf16_iter@1.0.5
utf8parse@0.2.2
utf8_iter@1.0.4
valuable@0.1.0
value-bag-serde1@1.10.0
value-bag-sval2@1.10.0
value-bag@1.10.0
vcpkg@0.2.15
walkdir@2.5.0
wasi@0.11.0+wasi-snapshot-preview1
wasm-bindgen-backend@0.2.95
wasm-bindgen-backend@0.2.99
wasm-bindgen-macro-support@0.2.95
wasm-bindgen-macro-support@0.2.99
wasm-bindgen-macro@0.2.95
wasm-bindgen-macro@0.2.99
wasm-bindgen-shared@0.2.95
wasm-bindgen-shared@0.2.99
wasm-bindgen@0.2.95
wasm-bindgen@0.2.99
web-sys@0.3.76
web-time@1.1.0
webpki-roots@0.26.7
winapi-i686-pc-windows-gnu@0.4.0
winapi-util@0.1.9
winapi-x86_64-pc-windows-gnu@0.4.0
winapi@0.3.9
windows-sys@0.48.0
windows-sys@0.52.0
windows-sys@0.59.0
windows-targets@0.48.5
windows-targets@0.52.6
windows_aarch64_gnullvm@0.48.5
windows_aarch64_gnullvm@0.52.6
windows_aarch64_msvc@0.48.5
windows_aarch64_msvc@0.52.6
windows_i686_gnu@0.48.5
windows_i686_gnu@0.52.6
windows_i686_gnullvm@0.52.6
windows_i686_msvc@0.48.5
windows_i686_msvc@0.52.6
windows_x86_64_gnu@0.48.5
windows_x86_64_gnu@0.52.6
windows_x86_64_gnullvm@0.48.5
windows_x86_64_gnullvm@0.52.6
windows_x86_64_msvc@0.48.5
windows_x86_64_msvc@0.52.6
write16@1.0.0
writeable@0.5.5
yoke-derive@0.7.5
yoke@0.7.5
zerocopy-derive@0.7.35
zerocopy@0.7.35
zerofrom-derive@0.1.5
zerofrom@0.1.5
zeroize@1.8.1
zerovec-derive@0.10.3
zerovec@0.10.4
"
# Both eclasses are needed: the phase functions below call cargo_* and
# distutils-r1_* helpers explicitly, one after the other.
inherit cargo distutils-r1
DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
# Upstream tag tarball, stored locally under a ${P}.gh.tar.gz name, followed
# by ${CARGO_CRATE_URIS} (expected to be derived from CRATES by the cargo
# eclass -- it is not assigned anywhere in this file).
SRC_URI="
https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
-> ${P}.gh.tar.gz
${CARGO_CRATE_URIS}
"
# License of the package itself; the crate licenses are appended below.
LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"
# Build-time dependencies: sci-ml/datasets only with USE=test, and
# setuptools-rust for the selected Python implementation.
BDEPEND="
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')
"
# NOTE(review): presumably this call is what declares the "test" USE flag used
# in BDEPEND above and hooks pytest into the test phase -- confirm against the
# distutils-r1 eclass documentation.
distutils_enable_tests pytest
# Pattern matching the shared objects installed under site-packages/tokenizers;
# files matching it are listed as ignored for the flags QA check.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack phase: delegated entirely to the cargo eclass helper.
src_unpack() {
	cargo_src_unpack
}
# Setup phase: run the setup hook for the Python side first, then the one for
# the Rust side. Both are called explicitly, in this order.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}
# Prepare phase: run the default preparation at the top of the source tree,
# then patch and prepare the Python bindings from inside bindings/python.
src_prepare() {
	default

	# The test patch and the distutils-r1 preparation both operate relative to
	# the Python bindings directory. Abort if it is missing rather than
	# silently patching from the wrong directory.
	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.15.2-test.patch
	distutils-r1_src_prepare
}
# Configure phase: configure the Rust crate in tokenizers/, then the Python
# bindings in bindings/python.
src_configure() {
	# Each directory change is checked so a layout change upstream fails
	# loudly instead of configuring in the wrong place.
	cd tokenizers || die
	cargo_src_configure

	cd ../bindings/python || die
	distutils-r1_src_configure
}
# Compile phase: build the Rust crate in tokenizers/, then build the Python
# bindings in bindings/python.
src_compile() {
	# Each directory change is checked so the build never runs from an
	# unexpected working directory.
	cd tokenizers || die
	cargo_src_compile

	cd ../bindings/python || die
	distutils-r1_src_compile
}
# Test phase: only the Python binding tests are run, from bindings/python.
src_test() {
	# Tests of the Rust crate do not work, so they stay disabled. If they are
	# ever re-enabled, cargo_src_test has to be run from the tokenizers/
	# directory before changing into bindings/python.
	#cargo_src_test

	# Go straight to the Python bindings; abort if the directory is missing.
	cd bindings/python || die

	# Individual tests excluded from the pytest run.
	# NOTE(review): the reason for each exclusion is not recorded in this
	# file -- confirm before removing entries.
	local EPYTEST_DESELECT=(
		"tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats"
		"tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids"
		"tests/bindings/test_encoding.py::TestEncoding::test_n_sequences"
		"tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens"
		"tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars"
		"tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence"
		"tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars"
		"tests/bindings/test_encoding.py::TestEncoding::test_token_to_word"
		"tests/bindings/test_encoding.py::TestEncoding::test_char_to_token"
		"tests/bindings/test_encoding.py::TestEncoding::test_char_to_word"
		"tests/bindings/test_encoding.py::TestEncoding::test_truncation"
		"tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction"
		"tests/bindings/test_models.py::TestBPE::test_instantiate"
		"tests/bindings/test_models.py::TestWordLevel::test_instantiate"
		"tests/bindings/test_models.py::TestWordPiece::test_instantiate"
		"tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing"
		"tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens"
		"tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained"
		"tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision"
		"tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens"
		"tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting"
		"tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mistmatch"
		"tests/bindings/test_trainers.py::TestUnigram::test_train"
		"tests/documentation/test_pipeline.py::TestPipeline::test_pipeline"
		"tests/documentation/test_pipeline.py::TestPipeline::test_bert_example"
		"tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour"
		"tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets"
		"tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip"
		"tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode"
		"tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism"
		"tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode"
		"tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space"
		"tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace"
		"tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism"
		"tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode"
		"tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase"
		"tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding"
		"tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism"
		"tests/test_serialization.py::TestSerialization::test_full_serialization_albert"
		"tests/test_serialization.py::TestSerialization::test_str_big"
	)

	# Skip the benches/ directory. This is a plain local array: bash cannot
	# export arrays, so no export attribute is set.
	local EPYTEST_IGNORE=( benches/ )

	distutils-r1_src_test
}
# Install phase: only the Python bindings are installed, from bindings/python.
src_install() {
	# Change directly into the bindings directory; nothing is run from
	# tokenizers/ during install. Abort if the directory is missing.
	cd bindings/python || die
	distutils-r1_src_install
}
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')
| Type | File | Size | Source URLs |
|---|---|---|---|
| DIST | tokenizers-0.21.0.gh.tar.gz | 1544853 bytes | https://github.com/huggingface/tokenizers/archive/refs/tags/v0.21.0.tar.gz |