| Version | EAPI | Keywords | Slot |
|---|---|---|---|
| 0.22.1 | 8 | ~amd64 | 0 |
# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.15.0
EAPI=8
# Build the Rust-backed Python extension with the maturin PEP 517 backend.
DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{10..13} )
# Compiled extension; build for a single Python implementation only.
DISTUTILS_EXT=1
DISTUTILS_SINGLE_IMPL=1
RUST_MIN_VER="1.82.0"
# Intentionally empty: crates are shipped via the pregenerated crate
# tarballs added to SRC_URI below, not fetched individually.
CRATES="
"
inherit cargo distutils-r1
DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
SRC_URI="
https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
-> ${P}.gh.tar.gz
${CARGO_CRATE_URIS}
"
# Skip the crate tarballs while pkgbump is regenerating them for a new
# version (PKGBUMPING is set to PVR during that process).
if [[ ${PKGBUMPING} != ${PVR} ]]; then
SRC_URI+="
https://dev.gentoo.org/~tupone/distfiles/${PN}-0.22.0-crates.tar.xz
https://dev.gentoo.org/~tupone/distfiles/${PN}-python-${PV}-crates.tar.xz
"
fi
LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"
# System oniguruma is used at runtime (see RUSTONIG_SYSTEM_LIBONIG below).
RDEPEND="dev-libs/oniguruma"
BDEPEND="
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')
"
distutils_enable_tests pytest
# The installed .so is built by cargo; skip the usual flag checks on it.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack the GitHub tarball plus the vendored crate tarballs via cargo.eclass.
src_unpack() {
cargo_src_unpack
}
# Select the Python implementation and verify a Rust >= RUST_MIN_VER toolchain.
pkg_setup() {
python-single-r1_pkg_setup
rust_pkg_setup
}
# Apply Gentoo patches, then run the distutils prepare phase from the
# Python bindings subdirectory (the PEP 517 project root).
src_prepare() {
	default
	# || die: an unchecked cd would silently run the rest of the phase
	# in the wrong directory.
	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.21.2-test.patch
	distutils-r1_src_prepare
}
# Configure the Rust workspace first, then the Python bindings.
src_configure() {
	cd tokenizers || die
	cargo_src_configure
	cd ../bindings/python || die
	distutils-r1_src_configure
}
# Build the Rust crate, then the Python extension.
src_compile() {
	# Link the onig crate against the system dev-libs/oniguruma instead
	# of building the bundled copy.
	export RUSTONIG_SYSTEM_LIBONIG=1
	cd tokenizers || die
	cargo_src_compile
	cd ../bindings/python || die
	distutils-r1_src_compile
}
# Run the Python binding tests; the Rust test suite is skipped (see below).
src_test() {
	cd tokenizers || die
	# Tests do not work
	#cargo_src_test
	cd ../bindings/python || die
	local -x EPYTEST_IGNORE=( benches/ )
	# Deselected tests require network access or fixture data not
	# available in the sandbox.
	local -x EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)
	distutils-r1_src_test
}
# Only the Python bindings install anything; the Rust crate itself is not
# installed. (The original did a pointless `cd tokenizers` immediately
# undone by `cd ../bindings/python`; collapsed to a single checked cd.)
src_install() {
	cd bindings/python || die
	distutils-r1_src_install
}
# NOTE(review): the following lines duplicate the RDEPEND/BDEPEND entries
# declared above and were stray top-level text (sourcing the ebuild would
# fail on them); kept here as comments so no content is lost.
# dev-libs/oniguruma
# test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
# $(python_gen_cond_dep '
# dev-python/setuptools-rust[${PYTHON_USEDEP}]
# ')
| Type | File | Size | Source URLs |
|---|---|---|---|
| DIST | tokenizers-0.22.1.gh.tar.gz | 1563505 bytes | https://github.com/huggingface/tokenizers/archive/refs/tags/v0.22.1.tar.gz |