Gentoo Packages

Search

Install

Install this version:

emerge -a =sci-ml/tokenizers-0.23.1

If this version is masked, you can unmask it using the autounmask tool or standard emerge options:

autounmask =sci-ml/tokenizers-0.23.1

Or alternatively:

emerge --autounmask-write -a =sci-ml/tokenizers-0.23.1

Package Information

Description: Implementation of today's most used tokenizers
Homepage: https://github.com/huggingface/tokenizers
License: Apache-2.0 Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0 Unicode-DFS-2016

Ebuild Details

Version	EAPI	Keywords	Slot
0.23.1	8	~amd64	0

View Raw Ebuild

# Copyright 2023-2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

# Autogenerated by pycargoebuild 0.15.0

EAPI=8

# Build configuration set before `inherit`: maturin as the PEP 517 backend,
# a compiled extension module, and a single Python implementation per install.
DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{12..14} )
DISTUTILS_EXT=1
DISTUTILS_SINGLE_IMPL=1
RUST_MIN_VER="1.87.0"

# Intentionally empty: no crates are listed individually here.
# NOTE(review): crate sources presumably come from the *-crates.tar.xz
# archives appended to SRC_URI below -- confirm.
CRATES="
"

inherit cargo distutils-r1

DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
# Upstream tag tarball, renamed to ${P}.gh.tar.gz, plus whatever
# ${CARGO_CRATE_URIS} expands to.
SRC_URI="
	https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
	-> ${P}.gh.tar.gz
	${CARGO_CRATE_URIS}
"
# The two crate tarballs (one for the core library, one for the Python
# bindings) are added unless PKGBUMPING is set to this exact ${PVR}.
# NOTE(review): presumably this lets a maintainer bump the version before
# the tarballs have been generated -- confirm.
if [[ ${PKGBUMPING} != ${PVR} ]]; then
	SRC_URI+="
		https://dev.gentoo.org/~tupone/distfiles/${P}-crates.tar.xz
		https://dev.gentoo.org/~tupone/distfiles/${PN}-python-${PV}-crates.tar.xz
	"
fi

# License of the package itself.
LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
	Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
	Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"

# System oniguruma is a runtime dependency; src_compile exports
# RUSTONIG_SYSTEM_LIBONIG=1 for the build.
RDEPEND="dev-libs/oniguruma"
# sci-ml/datasets is only needed when the test USE flag is enabled.
BDEPEND="
	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')
"

# Empty plugin list: no pytest plugins are requested for the test run.
EPYTEST_PLUGINS=( )
distutils_enable_tests pytest

# Pattern covers shared objects installed under site-packages/tokenizers/.
# NOTE(review): presumably the Rust-built extension does not honour the
# user's CFLAGS/LDFLAGS, hence the QA exemption -- confirm.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"

# Unpack phase: delegates entirely to the cargo eclass implementation.
# NOTE(review): presumably defined explicitly so that cargo's unpack logic
# is used regardless of which inherited eclass exports src_unpack -- confirm.
src_unpack() {
	cargo_src_unpack
}

# Setup phase: runs the Python (single-implementation) setup first, then the
# Rust toolchain setup. Both are called explicitly because only one
# pkg_setup can be the active phase function.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}

# Prepare phase: run the default preparation in the source root, then switch
# to the Python bindings directory to apply the test patch and run the
# distutils-r1 preparation there.
src_prepare() {
	default
	# Abort if the bindings directory is missing; otherwise the patch and the
	# eclass phase would silently run against the wrong directory.
	cd bindings/python || die
	# The patch file keeps its 0.21.2 name; it is reused for this version.
	eapply "${FILESDIR}"/${PN}-0.21.2-test.patch
	distutils-r1_src_prepare
}

# Configure phase: configure the Rust crate in tokenizers/ first, then the
# Python bindings in bindings/python/.
src_configure() {
	# Each directory change is checked so a layout change upstream fails
	# loudly instead of configuring in the wrong place.
	cd tokenizers || die
	cargo_src_configure
	cd ../bindings/python || die
	distutils-r1_src_configure
}

# Compile phase: build the Rust crate in tokenizers/, then build the Python
# bindings in bindings/python/.
src_compile() {
	# Exported for the whole build; intended to make the onig crate link
	# against the system oniguruma listed in RDEPEND rather than a bundled copy.
	export RUSTONIG_SYSTEM_LIBONIG=1
	# Each directory change is checked so a missing directory aborts the build.
	cd tokenizers || die
	cargo_src_compile
	cd ../bindings/python || die
	distutils-r1_src_compile
}

# Test phase: runs only the Python binding tests via distutils-r1.
# The Rust crate tests (cargo_src_test in tokenizers/) do not work and are
# therefore not run.
src_test() {
	# Abort if the bindings directory is missing rather than testing elsewhere.
	cd bindings/python || die
	# Plain `local` is enough: these arrays are read by the eclass functions
	# in the same shell, and bash cannot export arrays anyway.
	local EPYTEST_IGNORE=( benches/test_tiktoken.py )
	# Tests excluded from the run.
	# NOTE(review): reasons are not recorded here; presumably most need
	# network access or pretrained data -- confirm.
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_async_methods_existence
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_basic_encoding
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_concurrency
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_decode
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_encode
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_error_handling
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_large_batch
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_numpy_inputs
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_various_input_formats
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_performance_comparison
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_special_tokens
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_truncation_padding
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_skip_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_stream_fallback
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
	)
	distutils-r1_src_test
}

# Install phase: only the Python bindings are installed, via distutils-r1.
# Nothing is installed from the tokenizers/ crate directory itself.
src_install() {
	# Go straight to the bindings directory and abort if it is missing.
	cd bindings/python || die
	distutils-r1_src_install
}

Inherited Eclasses

cargo

distutils-r1

Dependencies

RDEPEND

dev-libs/oniguruma

BDEPEND

	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')

Manifest for 0.23.1

Type	File	Size	Source URLs
DIST	tokenizers-0.23.1.gh.tar.gz	1613786 bytes	https://github.com/huggingface/tokenizers/archive/refs/tags/v0.23.1.tar.gz

sci-ml/tokenizers - 0.23.1 (gentoo)

Search

Install

Package Information

Ebuild Details

Inherited Eclasses

Dependencies

RDEPEND

BDEPEND

Manifest for 0.23.1