Mock Version: 2.16 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'], chrootPath='/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=0uid=1000gid=135user='mockbuild'nspawn_args=['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11']unshare_net=TrueprintOutput=True) Using nspawn with args ['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11'] Executing command: ['/usr/bin/systemd-nspawn', '-q', '-M', '994d77c8807e467b8f3c20d0c27081c9', '-D', '/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root', '-a', '-u', 'mockbuild', '--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11', '--console=pipe', '--setenv=TERM=vt100', '--setenv=SHELL=/bin/bash', '--setenv=HOME=/builddir', '--setenv=HOSTNAME=mock', '--setenv=PATH=/usr/bin:/bin:/usr/sbin:/sbin', '--setenv=PROMPT_COMMAND=printf "\\033]0;\\007"', '--setenv=PS1= \\s-\\v\\$ ', '--setenv=LANG=C.UTF-8', '--resolv-conf=off', 'bash', '--login', '-c', '/usr/bin/rpmbuild -bs --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8', 'SYSTEMD_NSPAWN_TMPFS_TMP': '0', 'SYSTEMD_SECCOMP': '0'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1647648000 Wrote: /builddir/build/SRPMS/python-pystempel-1.2.0-1.el9.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -br --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'], chrootPath='/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=0uid=1000gid=135user='mockbuild'nspawn_args=['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11']unshare_net=TrueraiseExc=FalseprintOutput=True) Using nspawn with args ['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11'] Executing command: ['/usr/bin/systemd-nspawn', '-q', '-M', '2a3c55faf86e4ef5b80b7792ab9b1e6e', '-D', '/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root', '-a', '-u', 'mockbuild', '--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11', '--console=pipe', '--setenv=TERM=vt100', '--setenv=SHELL=/bin/bash', '--setenv=HOME=/builddir', '--setenv=HOSTNAME=mock', '--setenv=PATH=/usr/bin:/bin:/usr/sbin:/sbin', '--setenv=PROMPT_COMMAND=printf "\\033]0;\\007"', '--setenv=PS1= \\s-\\v\\$ ', '--setenv=LANG=C.UTF-8', '--resolv-conf=off', 'bash', '--login', '-c', '/usr/bin/rpmbuild -br --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8', 'SYSTEMD_NSPAWN_TMPFS_TMP': '0', 'SYSTEMD_SECCOMP': '0'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1647648000 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.tqqTm9 + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf pystempel-1.2.0 + /usr/bin/gzip -dc /builddir/build/SOURCES/pystempel-1.2.0.tar.gz + /usr/bin/tar -xof - + STATUS=0 + '[' 0 -ne 0 ']' + cd pystempel-1.2.0 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + RPM_EC=0 ++ jobs -p + exit 0 Executing(%generate_buildrequires): /bin/sh -e /var/tmp/rpm-tmp.eubTyj + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + echo pyproject-rpm-macros + echo python3-devel + echo 'python3dist(pip) >= 19' + echo 'python3dist(packaging)' + '[' -f pyproject.toml ']' + '[' -f setup.py ']' + echo 'python3dist(setuptools) >= 40.8' + echo 'python3dist(wheel)' + rm -rfv '*.dist-info/' + '[' -f /usr/bin/python3 ']' + RPM_TOXENV=py39 + HOSTNAME=rpmbuild + /usr/bin/python3 -s /usr/lib/rpm/redhat/pyproject_buildrequires.py --generate-extras --python3_pkgversion 3 -r Handling setuptools >= 40.8 from default build backend Requirement satisfied: setuptools >= 40.8 (installed: setuptools 53.0.0) Handling wheel from default build backend Requirement not satisfied: wheel Exiting dependency generation pass: build backend + RPM_EC=0 ++ jobs -p + exit 0 Wrote: /builddir/build/SRPMS/python-pystempel-1.2.0-1.el9.buildreqs.nosrc.rpm Child return code was: 11 Dynamic buildrequires detected Going to install missing buildrequires. See root.log for details. ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -br --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'], chrootPath='/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=0uid=1000gid=135user='mockbuild'nspawn_args=['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11']unshare_net=TrueraiseExc=FalseprintOutput=True) Using nspawn with args ['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11'] Executing command: ['/usr/bin/systemd-nspawn', '-q', '-M', 'e4d71ed99d654a91a3d7c4b892243ebe', '-D', '/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root', '-a', '-u', 'mockbuild', '--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11', '--console=pipe', '--setenv=TERM=vt100', '--setenv=SHELL=/bin/bash', '--setenv=HOME=/builddir', '--setenv=HOSTNAME=mock', '--setenv=PATH=/usr/bin:/bin:/usr/sbin:/sbin', '--setenv=PROMPT_COMMAND=printf "\\033]0;\\007"', '--setenv=PS1= \\s-\\v\\$ ', '--setenv=LANG=C.UTF-8', '--resolv-conf=off', 'bash', '--login', '-c', '/usr/bin/rpmbuild -br --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8', 'SYSTEMD_NSPAWN_TMPFS_TMP': '0', 'SYSTEMD_SECCOMP': '0'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1647648000 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.WaHqDP + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf pystempel-1.2.0 + /usr/bin/gzip -dc /builddir/build/SOURCES/pystempel-1.2.0.tar.gz + /usr/bin/tar -xof - + STATUS=0 + '[' 0 -ne 0 ']' + cd pystempel-1.2.0 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + RPM_EC=0 ++ jobs -p + exit 0 Executing(%generate_buildrequires): /bin/sh -e /var/tmp/rpm-tmp.aG6UXR + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + echo pyproject-rpm-macros + echo python3-devel + echo 'python3dist(pip) >= 19' + echo 'python3dist(packaging)' + '[' -f pyproject.toml ']' + '[' -f setup.py ']' + echo 'python3dist(setuptools) >= 40.8' + echo 'python3dist(wheel)' + rm -rfv '*.dist-info/' + '[' -f /usr/bin/python3 ']' + RPM_TOXENV=py39 + HOSTNAME=rpmbuild + /usr/bin/python3 -s /usr/lib/rpm/redhat/pyproject_buildrequires.py --generate-extras --python3_pkgversion 3 -r Handling setuptools >= 40.8 from default build backend Requirement satisfied: setuptools >= 40.8 (installed: setuptools 53.0.0) Handling wheel from default build backend Requirement satisfied: wheel (installed: wheel 0.36.2) HOOK STDOUT: Stempel Stemmer HOOK STDOUT: =============== HOOK STDOUT: HOOK STDOUT: .. image:: https://badge.fury.io/py/pystempel.svg HOOK STDOUT: :target: https://badge.fury.io/py/pystempel HOOK STDOUT: HOOK STDOUT: Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. HOOK STDOUT: HOOK STDOUT: The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to HOOK STDOUT: `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, HOOK STDOUT: a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. HOOK STDOUT: HOOK STDOUT: .. _Egothor Project: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html HOOK STDOUT: .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html HOOK STDOUT: .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html HOOK STDOUT: HOOK STDOUT: This package includes also high-quality stemming tables for Polish: original one pretrained by HOOK STDOUT: Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets HOOK STDOUT: from Polimorf dictionary by me. HOOK STDOUT: HOOK STDOUT: The port does not include code for compiling stemming tables. HOOK STDOUT: HOOK STDOUT: .. _sjp.pl: https://sjp.pl/slownik/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: How to use HOOK STDOUT: ---------- HOOK STDOUT: HOOK STDOUT: Install in your local environment: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: pip install pystempel HOOK STDOUT: HOOK STDOUT: Use in your code: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> from stempel import StempelStemmer HOOK STDOUT: HOOK STDOUT: Choose original (called default) version of a stemmer: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.default() HOOK STDOUT: HOOK STDOUT: or a version with new stemming table pretrained on training sets from Polimorf dictionary: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.polimorf() HOOK STDOUT: HOOK STDOUT: Stem: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: HOOK STDOUT: ... print(stemmer.stem(word)) HOOK STDOUT: ... HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książkowy HOOK STDOUT: książkowy HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing stemming table HOOK STDOUT: ----------------------- HOOK STDOUT: HOOK STDOUT: Performance between original (default) and new stemming table (Polimorf-based) varies significantly. HOOK STDOUT: The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the HOOK STDOUT: same lemma provides different stems more often (63%) than when using Polimorf-based stemming table HOOK STDOUT: (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes HOOK STDOUT: longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, HOOK STDOUT: for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. HOOK STDOUT: See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. HOOK STDOUT: HOOK STDOUT: .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html HOOK STDOUT: HOOK STDOUT: Note also, that the licensing schema of both stemming tables differs, and hence licensing of HOOK STDOUT: data generated with each one. See "Licensing" section for the details. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing between port and wrapper HOOK STDOUT: --------------------------------- HOOK STDOUT: HOOK STDOUT: If you work on an NLP project in Python you can choose between Python port and Python wrapper. HOOK STDOUT: Python port is what pystempel tries to achieve: translation from Java implementation to Python. HOOK STDOUT: Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of HOOK STDOUT: stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I HOOK STDOUT: compare both approaches to help you decide: HOOK STDOUT: HOOK STDOUT: * **Same accuracy**. I have verified Python port by comparing its output HOOK STDOUT: with output of original Java implementation for 331224 words from Free Polish dictionary HOOK STDOUT: (`sjp.pl`_) and for 100% of words it returns same output. HOOK STDOUT: * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. HOOK STDOUT: Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core HOOK STDOUT: i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) HOOK STDOUT: * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. HOOK STDOUT: Python wrapper will make also `debugging harder`_ (switching between two programming languages). HOOK STDOUT: HOOK STDOUT: .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch HOOK STDOUT: .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project HOOK STDOUT: .. _tests: tests/ HOOK STDOUT: HOOK STDOUT: Options HOOK STDOUT: ------- HOOK STDOUT: HOOK STDOUT: To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. HOOK STDOUT: HOOK STDOUT: Development setup HOOK STDOUT: ----------------- HOOK STDOUT: HOOK STDOUT: To setup environment for development you will need `Anaconda`_ installed. HOOK STDOUT: HOOK STDOUT: .. _Anaconda: https://anaconda.org/ HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: conda env create --file environment.yml HOOK STDOUT: conda activate pystempel-env HOOK STDOUT: pre-commit install HOOK STDOUT: HOOK STDOUT: To run tests: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar HOOK STDOUT: python -m pytest ./ HOOK STDOUT: HOOK STDOUT: To run benchmark: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: set PYTHONPATH=%PYTHONPATH%;%cd% HOOK STDOUT: python tests\test_benchmark.py HOOK STDOUT: HOOK STDOUT: Licensing HOOK STDOUT: --------- HOOK STDOUT: HOOK STDOUT: * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. HOOK STDOUT: The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble HOOK STDOUT: of each file. HOOK STDOUT: HOOK STDOUT: * **Data**: HOOK STDOUT: HOOK STDOUT: * The original pretrained stemming table is covered by `Apache License 2.0`_. HOOK STDOUT: HOOK STDOUT: * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the HOOK STDOUT: `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table HOOK STDOUT: and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). HOOK STDOUT: HOOK STDOUT: * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by HOOK STDOUT: `Apache License 2.0`_ as well. HOOK STDOUT: HOOK STDOUT: .. _Egothor: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 HOOK STDOUT: .. _Polimorf dictionary: dicts/ HOOK STDOUT: .. _2-Clause BSD License: data/polimorf/LICENSE.txt HOOK STDOUT: .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Alternatives HOOK STDOUT: ------------ HOOK STDOUT: HOOK STDOUT: * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. HOOK STDOUT: * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. HOOK STDOUT: * `polish-stem`_ is a Python stemmer using Finite State Transducers. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: .. _Estem: https://github.com/arcusfelis/estem HOOK STDOUT: .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer HOOK STDOUT: .. _polish-stem: https://github.com/eugeniashurko/polish-stem HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: running egg_info HOOK STDOUT: writing pystempel.egg-info/PKG-INFO HOOK STDOUT: writing dependency_links to pystempel.egg-info/dependency_links.txt HOOK STDOUT: writing requirements to pystempel.egg-info/requires.txt HOOK STDOUT: writing top-level names to pystempel.egg-info/top_level.txt HOOK STDOUT: reading manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' Handling wheel from get_requires_for_build_wheel Requirement satisfied: wheel (installed: wheel 0.36.2) HOOK STDOUT: Stempel Stemmer HOOK STDOUT: =============== HOOK STDOUT: HOOK STDOUT: .. image:: https://badge.fury.io/py/pystempel.svg HOOK STDOUT: :target: https://badge.fury.io/py/pystempel HOOK STDOUT: HOOK STDOUT: Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. HOOK STDOUT: HOOK STDOUT: The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to HOOK STDOUT: `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, HOOK STDOUT: a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. HOOK STDOUT: HOOK STDOUT: .. _Egothor Project: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html HOOK STDOUT: .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html HOOK STDOUT: .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html HOOK STDOUT: HOOK STDOUT: This package includes also high-quality stemming tables for Polish: original one pretrained by HOOK STDOUT: Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets HOOK STDOUT: from Polimorf dictionary by me. HOOK STDOUT: HOOK STDOUT: The port does not include code for compiling stemming tables. HOOK STDOUT: HOOK STDOUT: .. _sjp.pl: https://sjp.pl/slownik/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: How to use HOOK STDOUT: ---------- HOOK STDOUT: HOOK STDOUT: Install in your local environment: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: pip install pystempel HOOK STDOUT: HOOK STDOUT: Use in your code: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> from stempel import StempelStemmer HOOK STDOUT: HOOK STDOUT: Choose original (called default) version of a stemmer: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.default() HOOK STDOUT: HOOK STDOUT: or a version with new stemming table pretrained on training sets from Polimorf dictionary: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.polimorf() HOOK STDOUT: HOOK STDOUT: Stem: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: HOOK STDOUT: ... print(stemmer.stem(word)) HOOK STDOUT: ... HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książkowy HOOK STDOUT: książkowy HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing stemming table HOOK STDOUT: ----------------------- HOOK STDOUT: HOOK STDOUT: Performance between original (default) and new stemming table (Polimorf-based) varies significantly. HOOK STDOUT: The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the HOOK STDOUT: same lemma provides different stems more often (63%) than when using Polimorf-based stemming table HOOK STDOUT: (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes HOOK STDOUT: longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, HOOK STDOUT: for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. HOOK STDOUT: See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. HOOK STDOUT: HOOK STDOUT: .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html HOOK STDOUT: HOOK STDOUT: Note also, that the licensing schema of both stemming tables differs, and hence licensing of HOOK STDOUT: data generated with each one. See "Licensing" section for the details. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing between port and wrapper HOOK STDOUT: --------------------------------- HOOK STDOUT: HOOK STDOUT: If you work on an NLP project in Python you can choose between Python port and Python wrapper. HOOK STDOUT: Python port is what pystempel tries to achieve: translation from Java implementation to Python. HOOK STDOUT: Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of HOOK STDOUT: stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I HOOK STDOUT: compare both approaches to help you decide: HOOK STDOUT: HOOK STDOUT: * **Same accuracy**. I have verified Python port by comparing its output HOOK STDOUT: with output of original Java implementation for 331224 words from Free Polish dictionary HOOK STDOUT: (`sjp.pl`_) and for 100% of words it returns same output. HOOK STDOUT: * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. HOOK STDOUT: Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core HOOK STDOUT: i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) HOOK STDOUT: * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. HOOK STDOUT: Python wrapper will make also `debugging harder`_ (switching between two programming languages). HOOK STDOUT: HOOK STDOUT: .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch HOOK STDOUT: .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project HOOK STDOUT: .. _tests: tests/ HOOK STDOUT: HOOK STDOUT: Options HOOK STDOUT: ------- HOOK STDOUT: HOOK STDOUT: To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. HOOK STDOUT: HOOK STDOUT: Development setup HOOK STDOUT: ----------------- HOOK STDOUT: HOOK STDOUT: To setup environment for development you will need `Anaconda`_ installed. HOOK STDOUT: HOOK STDOUT: .. _Anaconda: https://anaconda.org/ HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: conda env create --file environment.yml HOOK STDOUT: conda activate pystempel-env HOOK STDOUT: pre-commit install HOOK STDOUT: HOOK STDOUT: To run tests: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar HOOK STDOUT: python -m pytest ./ HOOK STDOUT: HOOK STDOUT: To run benchmark: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: set PYTHONPATH=%PYTHONPATH%;%cd% HOOK STDOUT: python tests\test_benchmark.py HOOK STDOUT: HOOK STDOUT: Licensing HOOK STDOUT: --------- HOOK STDOUT: HOOK STDOUT: * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. HOOK STDOUT: The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble HOOK STDOUT: of each file. HOOK STDOUT: HOOK STDOUT: * **Data**: HOOK STDOUT: HOOK STDOUT: * The original pretrained stemming table is covered by `Apache License 2.0`_. HOOK STDOUT: HOOK STDOUT: * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the HOOK STDOUT: `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table HOOK STDOUT: and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). HOOK STDOUT: HOOK STDOUT: * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by HOOK STDOUT: `Apache License 2.0`_ as well. HOOK STDOUT: HOOK STDOUT: .. _Egothor: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 HOOK STDOUT: .. _Polimorf dictionary: dicts/ HOOK STDOUT: .. _2-Clause BSD License: data/polimorf/LICENSE.txt HOOK STDOUT: .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Alternatives HOOK STDOUT: ------------ HOOK STDOUT: HOOK STDOUT: * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. HOOK STDOUT: * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. HOOK STDOUT: * `polish-stem`_ is a Python stemmer using Finite State Transducers. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: .. _Estem: https://github.com/arcusfelis/estem HOOK STDOUT: .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer HOOK STDOUT: .. _polish-stem: https://github.com/eugeniashurko/polish-stem HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: running dist_info HOOK STDOUT: writing pystempel.egg-info/PKG-INFO HOOK STDOUT: writing dependency_links to pystempel.egg-info/dependency_links.txt HOOK STDOUT: writing requirements to pystempel.egg-info/requires.txt HOOK STDOUT: writing top-level names to pystempel.egg-info/top_level.txt HOOK STDOUT: reading manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: creating '/builddir/build/BUILD/pystempel-1.2.0/pystempel.dist-info' Handling sortedcontainers from wheel metadata: Requires-Dist Requirement not satisfied: sortedcontainers Handling tqdm from wheel metadata: Requires-Dist Requirement not satisfied: tqdm + RPM_EC=0 ++ jobs -p + exit 0 Wrote: /builddir/build/SRPMS/python-pystempel-1.2.0-1.el9.buildreqs.nosrc.rpm Child return code was: 11 Dynamic buildrequires detected Going to install missing buildrequires. See root.log for details. ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -br --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'], chrootPath='/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=0uid=1000gid=135user='mockbuild'nspawn_args=['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11']unshare_net=TrueraiseExc=FalseprintOutput=True) Using nspawn with args ['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11'] Executing command: ['/usr/bin/systemd-nspawn', '-q', '-M', '607f5bff523e4889aa80d207723549e3', '-D', '/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root', '-a', '-u', 'mockbuild', '--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11', '--console=pipe', '--setenv=TERM=vt100', '--setenv=SHELL=/bin/bash', '--setenv=HOME=/builddir', '--setenv=HOSTNAME=mock', '--setenv=PATH=/usr/bin:/bin:/usr/sbin:/sbin', '--setenv=PROMPT_COMMAND=printf "\\033]0;\\007"', '--setenv=PS1= \\s-\\v\\$ ', '--setenv=LANG=C.UTF-8', '--resolv-conf=off', 'bash', '--login', '-c', '/usr/bin/rpmbuild -br --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8', 'SYSTEMD_NSPAWN_TMPFS_TMP': '0', 'SYSTEMD_SECCOMP': '0'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1647648000 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.Ev1lFC + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf pystempel-1.2.0 + /usr/bin/gzip -dc /builddir/build/SOURCES/pystempel-1.2.0.tar.gz + /usr/bin/tar -xof - + STATUS=0 + '[' 0 -ne 0 ']' + cd pystempel-1.2.0 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + RPM_EC=0 ++ jobs -p + exit 0 Executing(%generate_buildrequires): /bin/sh -e /var/tmp/rpm-tmp.QzBiQx + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + echo pyproject-rpm-macros + echo python3-devel + echo 'python3dist(pip) >= 19' + echo 'python3dist(packaging)' + '[' -f pyproject.toml ']' + '[' -f setup.py ']' + echo 'python3dist(setuptools) >= 40.8' + echo 'python3dist(wheel)' + rm -rfv '*.dist-info/' + '[' -f /usr/bin/python3 ']' + RPM_TOXENV=py39 + HOSTNAME=rpmbuild + /usr/bin/python3 -s /usr/lib/rpm/redhat/pyproject_buildrequires.py --generate-extras --python3_pkgversion 3 -r Handling setuptools >= 40.8 from default build backend Requirement satisfied: setuptools >= 40.8 (installed: setuptools 53.0.0) Handling wheel from default build backend Requirement satisfied: wheel (installed: wheel 0.36.2) HOOK STDOUT: Stempel Stemmer HOOK STDOUT: =============== HOOK STDOUT: HOOK STDOUT: .. image:: https://badge.fury.io/py/pystempel.svg HOOK STDOUT: :target: https://badge.fury.io/py/pystempel HOOK STDOUT: HOOK STDOUT: Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. HOOK STDOUT: HOOK STDOUT: The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to HOOK STDOUT: `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, HOOK STDOUT: a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. HOOK STDOUT: HOOK STDOUT: .. _Egothor Project: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html HOOK STDOUT: .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html HOOK STDOUT: .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html HOOK STDOUT: HOOK STDOUT: This package includes also high-quality stemming tables for Polish: original one pretrained by HOOK STDOUT: Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets HOOK STDOUT: from Polimorf dictionary by me. HOOK STDOUT: HOOK STDOUT: The port does not include code for compiling stemming tables. HOOK STDOUT: HOOK STDOUT: .. _sjp.pl: https://sjp.pl/slownik/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: How to use HOOK STDOUT: ---------- HOOK STDOUT: HOOK STDOUT: Install in your local environment: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: pip install pystempel HOOK STDOUT: HOOK STDOUT: Use in your code: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> from stempel import StempelStemmer HOOK STDOUT: HOOK STDOUT: Choose original (called default) version of a stemmer: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.default() HOOK STDOUT: HOOK STDOUT: or a version with new stemming table pretrained on training sets from Polimorf dictionary: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.polimorf() HOOK STDOUT: HOOK STDOUT: Stem: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: HOOK STDOUT: ... print(stemmer.stem(word)) HOOK STDOUT: ... HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książkowy HOOK STDOUT: książkowy HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing stemming table HOOK STDOUT: ----------------------- HOOK STDOUT: HOOK STDOUT: Performance between original (default) and new stemming table (Polimorf-based) varies significantly. HOOK STDOUT: The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the HOOK STDOUT: same lemma provides different stems more often (63%) than when using Polimorf-based stemming table HOOK STDOUT: (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes HOOK STDOUT: longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, HOOK STDOUT: for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. HOOK STDOUT: See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. HOOK STDOUT: HOOK STDOUT: .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html HOOK STDOUT: HOOK STDOUT: Note also, that the licensing schema of both stemming tables differs, and hence licensing of HOOK STDOUT: data generated with each one. See "Licensing" section for the details. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing between port and wrapper HOOK STDOUT: --------------------------------- HOOK STDOUT: HOOK STDOUT: If you work on an NLP project in Python you can choose between Python port and Python wrapper. HOOK STDOUT: Python port is what pystempel tries to achieve: translation from Java implementation to Python. HOOK STDOUT: Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of HOOK STDOUT: stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I HOOK STDOUT: compare both approaches to help you decide: HOOK STDOUT: HOOK STDOUT: * **Same accuracy**. I have verified Python port by comparing its output HOOK STDOUT: with output of original Java implementation for 331224 words from Free Polish dictionary HOOK STDOUT: (`sjp.pl`_) and for 100% of words it returns same output. HOOK STDOUT: * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. HOOK STDOUT: Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core HOOK STDOUT: i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) HOOK STDOUT: * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. HOOK STDOUT: Python wrapper will make also `debugging harder`_ (switching between two programming languages). HOOK STDOUT: HOOK STDOUT: .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch HOOK STDOUT: .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project HOOK STDOUT: .. _tests: tests/ HOOK STDOUT: HOOK STDOUT: Options HOOK STDOUT: ------- HOOK STDOUT: HOOK STDOUT: To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. HOOK STDOUT: HOOK STDOUT: Development setup HOOK STDOUT: ----------------- HOOK STDOUT: HOOK STDOUT: To setup environment for development you will need `Anaconda`_ installed. HOOK STDOUT: HOOK STDOUT: .. _Anaconda: https://anaconda.org/ HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: conda env create --file environment.yml HOOK STDOUT: conda activate pystempel-env HOOK STDOUT: pre-commit install HOOK STDOUT: HOOK STDOUT: To run tests: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar HOOK STDOUT: python -m pytest ./ HOOK STDOUT: HOOK STDOUT: To run benchmark: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: set PYTHONPATH=%PYTHONPATH%;%cd% HOOK STDOUT: python tests\test_benchmark.py HOOK STDOUT: HOOK STDOUT: Licensing HOOK STDOUT: --------- HOOK STDOUT: HOOK STDOUT: * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. HOOK STDOUT: The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble HOOK STDOUT: of each file. HOOK STDOUT: HOOK STDOUT: * **Data**: HOOK STDOUT: HOOK STDOUT: * The original pretrained stemming table is covered by `Apache License 2.0`_. HOOK STDOUT: HOOK STDOUT: * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the HOOK STDOUT: `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table HOOK STDOUT: and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). HOOK STDOUT: HOOK STDOUT: * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by HOOK STDOUT: `Apache License 2.0`_ as well. HOOK STDOUT: HOOK STDOUT: .. _Egothor: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 HOOK STDOUT: .. _Polimorf dictionary: dicts/ HOOK STDOUT: .. _2-Clause BSD License: data/polimorf/LICENSE.txt HOOK STDOUT: .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Alternatives HOOK STDOUT: ------------ HOOK STDOUT: HOOK STDOUT: * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. HOOK STDOUT: * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. HOOK STDOUT: * `polish-stem`_ is a Python stemmer using Finite State Transducers. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: .. _Estem: https://github.com/arcusfelis/estem HOOK STDOUT: .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer HOOK STDOUT: .. _polish-stem: https://github.com/eugeniashurko/polish-stem HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: running egg_info HOOK STDOUT: writing pystempel.egg-info/PKG-INFO HOOK STDOUT: writing dependency_links to pystempel.egg-info/dependency_links.txt HOOK STDOUT: writing requirements to pystempel.egg-info/requires.txt HOOK STDOUT: writing top-level names to pystempel.egg-info/top_level.txt HOOK STDOUT: reading manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' Handling wheel from get_requires_for_build_wheel Requirement satisfied: wheel (installed: wheel 0.36.2) HOOK STDOUT: Stempel Stemmer HOOK STDOUT: =============== HOOK STDOUT: HOOK STDOUT: .. image:: https://badge.fury.io/py/pystempel.svg HOOK STDOUT: :target: https://badge.fury.io/py/pystempel HOOK STDOUT: HOOK STDOUT: Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. HOOK STDOUT: HOOK STDOUT: The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to HOOK STDOUT: `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, HOOK STDOUT: a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. HOOK STDOUT: HOOK STDOUT: .. _Egothor Project: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html HOOK STDOUT: .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html HOOK STDOUT: .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html HOOK STDOUT: HOOK STDOUT: This package includes also high-quality stemming tables for Polish: original one pretrained by HOOK STDOUT: Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets HOOK STDOUT: from Polimorf dictionary by me. HOOK STDOUT: HOOK STDOUT: The port does not include code for compiling stemming tables. HOOK STDOUT: HOOK STDOUT: .. _sjp.pl: https://sjp.pl/slownik/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: How to use HOOK STDOUT: ---------- HOOK STDOUT: HOOK STDOUT: Install in your local environment: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: pip install pystempel HOOK STDOUT: HOOK STDOUT: Use in your code: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> from stempel import StempelStemmer HOOK STDOUT: HOOK STDOUT: Choose original (called default) version of a stemmer: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.default() HOOK STDOUT: HOOK STDOUT: or a version with new stemming table pretrained on training sets from Polimorf dictionary: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.polimorf() HOOK STDOUT: HOOK STDOUT: Stem: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: HOOK STDOUT: ... print(stemmer.stem(word)) HOOK STDOUT: ... HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książkowy HOOK STDOUT: książkowy HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing stemming table HOOK STDOUT: ----------------------- HOOK STDOUT: HOOK STDOUT: Performance between original (default) and new stemming table (Polimorf-based) varies significantly. HOOK STDOUT: The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the HOOK STDOUT: same lemma provides different stems more often (63%) than when using Polimorf-based stemming table HOOK STDOUT: (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes HOOK STDOUT: longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, HOOK STDOUT: for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. HOOK STDOUT: See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. HOOK STDOUT: HOOK STDOUT: .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html HOOK STDOUT: HOOK STDOUT: Note also, that the licensing schema of both stemming tables differs, and hence licensing of HOOK STDOUT: data generated with each one. See "Licensing" section for the details. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing between port and wrapper HOOK STDOUT: --------------------------------- HOOK STDOUT: HOOK STDOUT: If you work on an NLP project in Python you can choose between Python port and Python wrapper. HOOK STDOUT: Python port is what pystempel tries to achieve: translation from Java implementation to Python. HOOK STDOUT: Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of HOOK STDOUT: stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I HOOK STDOUT: compare both approaches to help you decide: HOOK STDOUT: HOOK STDOUT: * **Same accuracy**. I have verified Python port by comparing its output HOOK STDOUT: with output of original Java implementation for 331224 words from Free Polish dictionary HOOK STDOUT: (`sjp.pl`_) and for 100% of words it returns same output. HOOK STDOUT: * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. HOOK STDOUT: Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core HOOK STDOUT: i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) HOOK STDOUT: * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. HOOK STDOUT: Python wrapper will make also `debugging harder`_ (switching between two programming languages). HOOK STDOUT: HOOK STDOUT: .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch HOOK STDOUT: .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project HOOK STDOUT: .. _tests: tests/ HOOK STDOUT: HOOK STDOUT: Options HOOK STDOUT: ------- HOOK STDOUT: HOOK STDOUT: To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. HOOK STDOUT: HOOK STDOUT: Development setup HOOK STDOUT: ----------------- HOOK STDOUT: HOOK STDOUT: To setup environment for development you will need `Anaconda`_ installed. HOOK STDOUT: HOOK STDOUT: .. _Anaconda: https://anaconda.org/ HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: conda env create --file environment.yml HOOK STDOUT: conda activate pystempel-env HOOK STDOUT: pre-commit install HOOK STDOUT: HOOK STDOUT: To run tests: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar HOOK STDOUT: python -m pytest ./ HOOK STDOUT: HOOK STDOUT: To run benchmark: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: set PYTHONPATH=%PYTHONPATH%;%cd% HOOK STDOUT: python tests\test_benchmark.py HOOK STDOUT: HOOK STDOUT: Licensing HOOK STDOUT: --------- HOOK STDOUT: HOOK STDOUT: * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. HOOK STDOUT: The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble HOOK STDOUT: of each file. HOOK STDOUT: HOOK STDOUT: * **Data**: HOOK STDOUT: HOOK STDOUT: * The original pretrained stemming table is covered by `Apache License 2.0`_. HOOK STDOUT: HOOK STDOUT: * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the HOOK STDOUT: `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table HOOK STDOUT: and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). HOOK STDOUT: HOOK STDOUT: * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by HOOK STDOUT: `Apache License 2.0`_ as well. HOOK STDOUT: HOOK STDOUT: .. _Egothor: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 HOOK STDOUT: .. _Polimorf dictionary: dicts/ HOOK STDOUT: .. _2-Clause BSD License: data/polimorf/LICENSE.txt HOOK STDOUT: .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Alternatives HOOK STDOUT: ------------ HOOK STDOUT: HOOK STDOUT: * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. HOOK STDOUT: * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. HOOK STDOUT: * `polish-stem`_ is a Python stemmer using Finite State Transducers. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: .. _Estem: https://github.com/arcusfelis/estem HOOK STDOUT: .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer HOOK STDOUT: .. _polish-stem: https://github.com/eugeniashurko/polish-stem HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: running dist_info HOOK STDOUT: writing pystempel.egg-info/PKG-INFO HOOK STDOUT: writing dependency_links to pystempel.egg-info/dependency_links.txt HOOK STDOUT: writing requirements to pystempel.egg-info/requires.txt HOOK STDOUT: writing top-level names to pystempel.egg-info/top_level.txt HOOK STDOUT: reading manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: creating '/builddir/build/BUILD/pystempel-1.2.0/pystempel.dist-info' Handling sortedcontainers from wheel metadata: Requires-Dist Requirement satisfied: sortedcontainers (installed: sortedcontainers 2.4.0) Handling tqdm from wheel metadata: Requires-Dist Requirement satisfied: tqdm (installed: tqdm 4.63.0) + RPM_EC=0 ++ jobs -p + exit 0 Wrote: /builddir/build/SRPMS/python-pystempel-1.2.0-1.el9.buildreqs.nosrc.rpm Child return code was: 11 Dynamic buildrequires detected Going to install missing buildrequires. See root.log for details. ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -ba --noprep --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'], chrootPath='/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=0uid=1000gid=135user='mockbuild'nspawn_args=['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11']unshare_net=TrueprintOutput=True) Using nspawn with args ['--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11'] Executing command: ['/usr/bin/systemd-nspawn', '-q', '-M', 'e57ac4267da441628ec75fcc0a563420', '-D', '/var/lib/mock/centos-stream+epel-9-x86_64-1647767947.508400/root', '-a', '-u', 'mockbuild', '--capability=cap_ipc_lock', '--rlimit=RLIMIT_NOFILE=10240', '--capability=cap_ipc_lock', '--bind=/tmp/mock-resolv.jsyf8alv:/etc/resolv.conf', '--bind=/dev/btrfs-control', '--bind=/dev/loop-control', '--bind=/dev/loop0', '--bind=/dev/loop1', '--bind=/dev/loop2', '--bind=/dev/loop3', '--bind=/dev/loop4', '--bind=/dev/loop5', '--bind=/dev/loop6', '--bind=/dev/loop7', '--bind=/dev/loop8', '--bind=/dev/loop9', '--bind=/dev/loop10', '--bind=/dev/loop11', '--console=pipe', '--setenv=TERM=vt100', '--setenv=SHELL=/bin/bash', '--setenv=HOME=/builddir', '--setenv=HOSTNAME=mock', '--setenv=PATH=/usr/bin:/bin:/usr/sbin:/sbin', '--setenv=PROMPT_COMMAND=printf "\\033]0;\\007"', '--setenv=PS1= \\s-\\v\\$ ', '--setenv=LANG=C.UTF-8', '--resolv-conf=off', 'bash', '--login', '-c', '/usr/bin/rpmbuild -ba --noprep --target x86_64 --nodeps /builddir/build/SPECS/python-pystempel.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8', 'SYSTEMD_NSPAWN_TMPFS_TMP': '0', 'SYSTEMD_SECCOMP': '0'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1647648000 Executing(%generate_buildrequires): /bin/sh -e /var/tmp/rpm-tmp.WYBFMs + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + echo pyproject-rpm-macros + echo python3-devel + echo 'python3dist(pip) >= 19' + echo 'python3dist(packaging)' + '[' -f pyproject.toml ']' + '[' -f setup.py ']' + echo 'python3dist(setuptools) >= 40.8' + echo 'python3dist(wheel)' + rm -rfv pystempel.dist-info/ removed 'pystempel.dist-info/METADATA' removed 'pystempel.dist-info/top_level.txt' removed directory 'pystempel.dist-info/' + '[' -f /usr/bin/python3 ']' + RPM_TOXENV=py39 + HOSTNAME=rpmbuild + /usr/bin/python3 -s /usr/lib/rpm/redhat/pyproject_buildrequires.py --generate-extras --python3_pkgversion 3 -r Handling setuptools >= 40.8 from default build backend Requirement satisfied: setuptools >= 40.8 (installed: setuptools 53.0.0) Handling wheel from default build backend Requirement satisfied: wheel (installed: wheel 0.36.2) HOOK STDOUT: Stempel Stemmer HOOK STDOUT: =============== HOOK STDOUT: HOOK STDOUT: .. image:: https://badge.fury.io/py/pystempel.svg HOOK STDOUT: :target: https://badge.fury.io/py/pystempel HOOK STDOUT: HOOK STDOUT: Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. HOOK STDOUT: HOOK STDOUT: The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to HOOK STDOUT: `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, HOOK STDOUT: a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. HOOK STDOUT: HOOK STDOUT: .. _Egothor Project: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html HOOK STDOUT: .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html HOOK STDOUT: .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html HOOK STDOUT: HOOK STDOUT: This package includes also high-quality stemming tables for Polish: original one pretrained by HOOK STDOUT: Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets HOOK STDOUT: from Polimorf dictionary by me. HOOK STDOUT: HOOK STDOUT: The port does not include code for compiling stemming tables. HOOK STDOUT: HOOK STDOUT: .. _sjp.pl: https://sjp.pl/slownik/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: How to use HOOK STDOUT: ---------- HOOK STDOUT: HOOK STDOUT: Install in your local environment: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: pip install pystempel HOOK STDOUT: HOOK STDOUT: Use in your code: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> from stempel import StempelStemmer HOOK STDOUT: HOOK STDOUT: Choose original (called default) version of a stemmer: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.default() HOOK STDOUT: HOOK STDOUT: or a version with new stemming table pretrained on training sets from Polimorf dictionary: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.polimorf() HOOK STDOUT: HOOK STDOUT: Stem: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: HOOK STDOUT: ... print(stemmer.stem(word)) HOOK STDOUT: ... HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książkowy HOOK STDOUT: książkowy HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing stemming table HOOK STDOUT: ----------------------- HOOK STDOUT: HOOK STDOUT: Performance between original (default) and new stemming table (Polimorf-based) varies significantly. HOOK STDOUT: The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the HOOK STDOUT: same lemma provides different stems more often (63%) than when using Polimorf-based stemming table HOOK STDOUT: (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes HOOK STDOUT: longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, HOOK STDOUT: for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. HOOK STDOUT: See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. HOOK STDOUT: HOOK STDOUT: .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html HOOK STDOUT: HOOK STDOUT: Note also, that the licensing schema of both stemming tables differs, and hence licensing of HOOK STDOUT: data generated with each one. See "Licensing" section for the details. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing between port and wrapper HOOK STDOUT: --------------------------------- HOOK STDOUT: HOOK STDOUT: If you work on an NLP project in Python you can choose between Python port and Python wrapper. HOOK STDOUT: Python port is what pystempel tries to achieve: translation from Java implementation to Python. HOOK STDOUT: Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of HOOK STDOUT: stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I HOOK STDOUT: compare both approaches to help you decide: HOOK STDOUT: HOOK STDOUT: * **Same accuracy**. I have verified Python port by comparing its output HOOK STDOUT: with output of original Java implementation for 331224 words from Free Polish dictionary HOOK STDOUT: (`sjp.pl`_) and for 100% of words it returns same output. HOOK STDOUT: * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. HOOK STDOUT: Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core HOOK STDOUT: i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) HOOK STDOUT: * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. HOOK STDOUT: Python wrapper will make also `debugging harder`_ (switching between two programming languages). HOOK STDOUT: HOOK STDOUT: .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch HOOK STDOUT: .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project HOOK STDOUT: .. _tests: tests/ HOOK STDOUT: HOOK STDOUT: Options HOOK STDOUT: ------- HOOK STDOUT: HOOK STDOUT: To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. HOOK STDOUT: HOOK STDOUT: Development setup HOOK STDOUT: ----------------- HOOK STDOUT: HOOK STDOUT: To setup environment for development you will need `Anaconda`_ installed. HOOK STDOUT: HOOK STDOUT: .. _Anaconda: https://anaconda.org/ HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: conda env create --file environment.yml HOOK STDOUT: conda activate pystempel-env HOOK STDOUT: pre-commit install HOOK STDOUT: HOOK STDOUT: To run tests: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar HOOK STDOUT: python -m pytest ./ HOOK STDOUT: HOOK STDOUT: To run benchmark: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: set PYTHONPATH=%PYTHONPATH%;%cd% HOOK STDOUT: python tests\test_benchmark.py HOOK STDOUT: HOOK STDOUT: Licensing HOOK STDOUT: --------- HOOK STDOUT: HOOK STDOUT: * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. HOOK STDOUT: The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble HOOK STDOUT: of each file. HOOK STDOUT: HOOK STDOUT: * **Data**: HOOK STDOUT: HOOK STDOUT: * The original pretrained stemming table is covered by `Apache License 2.0`_. HOOK STDOUT: HOOK STDOUT: * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the HOOK STDOUT: `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table HOOK STDOUT: and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). HOOK STDOUT: HOOK STDOUT: * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by HOOK STDOUT: `Apache License 2.0`_ as well. HOOK STDOUT: HOOK STDOUT: .. _Egothor: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 HOOK STDOUT: .. _Polimorf dictionary: dicts/ HOOK STDOUT: .. _2-Clause BSD License: data/polimorf/LICENSE.txt HOOK STDOUT: .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Alternatives HOOK STDOUT: ------------ HOOK STDOUT: HOOK STDOUT: * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. HOOK STDOUT: * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. HOOK STDOUT: * `polish-stem`_ is a Python stemmer using Finite State Transducers. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: .. _Estem: https://github.com/arcusfelis/estem HOOK STDOUT: .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer HOOK STDOUT: .. _polish-stem: https://github.com/eugeniashurko/polish-stem HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: running egg_info HOOK STDOUT: creating pystempel.egg-info HOOK STDOUT: writing pystempel.egg-info/PKG-INFO HOOK STDOUT: writing dependency_links to pystempel.egg-info/dependency_links.txt HOOK STDOUT: writing requirements to pystempel.egg-info/requires.txt HOOK STDOUT: writing top-level names to pystempel.egg-info/top_level.txt HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: reading manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' Handling wheel from get_requires_for_build_wheel Requirement satisfied: wheel (installed: wheel 0.36.2) HOOK STDOUT: Stempel Stemmer HOOK STDOUT: =============== HOOK STDOUT: HOOK STDOUT: .. image:: https://badge.fury.io/py/pystempel.svg HOOK STDOUT: :target: https://badge.fury.io/py/pystempel HOOK STDOUT: HOOK STDOUT: Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. HOOK STDOUT: HOOK STDOUT: The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to HOOK STDOUT: `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, HOOK STDOUT: a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. HOOK STDOUT: HOOK STDOUT: .. _Egothor Project: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html HOOK STDOUT: .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html HOOK STDOUT: .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html HOOK STDOUT: HOOK STDOUT: This package includes also high-quality stemming tables for Polish: original one pretrained by HOOK STDOUT: Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets HOOK STDOUT: from Polimorf dictionary by me. HOOK STDOUT: HOOK STDOUT: The port does not include code for compiling stemming tables. HOOK STDOUT: HOOK STDOUT: .. _sjp.pl: https://sjp.pl/slownik/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: How to use HOOK STDOUT: ---------- HOOK STDOUT: HOOK STDOUT: Install in your local environment: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: pip install pystempel HOOK STDOUT: HOOK STDOUT: Use in your code: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> from stempel import StempelStemmer HOOK STDOUT: HOOK STDOUT: Choose original (called default) version of a stemmer: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.default() HOOK STDOUT: HOOK STDOUT: or a version with new stemming table pretrained on training sets from Polimorf dictionary: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> stemmer = StempelStemmer.polimorf() HOOK STDOUT: HOOK STDOUT: Stem: HOOK STDOUT: HOOK STDOUT: .. code:: python HOOK STDOUT: HOOK STDOUT: >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: HOOK STDOUT: ... print(stemmer.stem(word)) HOOK STDOUT: ... HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książek HOOK STDOUT: książkowy HOOK STDOUT: książkowy HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing stemming table HOOK STDOUT: ----------------------- HOOK STDOUT: HOOK STDOUT: Performance between original (default) and new stemming table (Polimorf-based) varies significantly. HOOK STDOUT: The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the HOOK STDOUT: same lemma provides different stems more often (63%) than when using Polimorf-based stemming table HOOK STDOUT: (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes HOOK STDOUT: longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, HOOK STDOUT: for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. HOOK STDOUT: See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. HOOK STDOUT: HOOK STDOUT: .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html HOOK STDOUT: HOOK STDOUT: Note also, that the licensing schema of both stemming tables differs, and hence licensing of HOOK STDOUT: data generated with each one. See "Licensing" section for the details. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Choosing between port and wrapper HOOK STDOUT: --------------------------------- HOOK STDOUT: HOOK STDOUT: If you work on an NLP project in Python you can choose between Python port and Python wrapper. HOOK STDOUT: Python port is what pystempel tries to achieve: translation from Java implementation to Python. HOOK STDOUT: Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of HOOK STDOUT: stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I HOOK STDOUT: compare both approaches to help you decide: HOOK STDOUT: HOOK STDOUT: * **Same accuracy**. I have verified Python port by comparing its output HOOK STDOUT: with output of original Java implementation for 331224 words from Free Polish dictionary HOOK STDOUT: (`sjp.pl`_) and for 100% of words it returns same output. HOOK STDOUT: * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. HOOK STDOUT: Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core HOOK STDOUT: i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) HOOK STDOUT: * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. HOOK STDOUT: Python wrapper will make also `debugging harder`_ (switching between two programming languages). HOOK STDOUT: HOOK STDOUT: .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch HOOK STDOUT: .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project HOOK STDOUT: .. _tests: tests/ HOOK STDOUT: HOOK STDOUT: Options HOOK STDOUT: ------- HOOK STDOUT: HOOK STDOUT: To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. HOOK STDOUT: HOOK STDOUT: Development setup HOOK STDOUT: ----------------- HOOK STDOUT: HOOK STDOUT: To setup environment for development you will need `Anaconda`_ installed. HOOK STDOUT: HOOK STDOUT: .. _Anaconda: https://anaconda.org/ HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: conda env create --file environment.yml HOOK STDOUT: conda activate pystempel-env HOOK STDOUT: pre-commit install HOOK STDOUT: HOOK STDOUT: To run tests: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar HOOK STDOUT: python -m pytest ./ HOOK STDOUT: HOOK STDOUT: To run benchmark: HOOK STDOUT: HOOK STDOUT: .. code:: console HOOK STDOUT: HOOK STDOUT: set PYTHONPATH=%PYTHONPATH%;%cd% HOOK STDOUT: python tests\test_benchmark.py HOOK STDOUT: HOOK STDOUT: Licensing HOOK STDOUT: --------- HOOK STDOUT: HOOK STDOUT: * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. HOOK STDOUT: The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble HOOK STDOUT: of each file. HOOK STDOUT: HOOK STDOUT: * **Data**: HOOK STDOUT: HOOK STDOUT: * The original pretrained stemming table is covered by `Apache License 2.0`_. HOOK STDOUT: HOOK STDOUT: * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the HOOK STDOUT: `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table HOOK STDOUT: and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). HOOK STDOUT: HOOK STDOUT: * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by HOOK STDOUT: `Apache License 2.0`_ as well. HOOK STDOUT: HOOK STDOUT: .. _Egothor: https://www.egothor.org/product/egothor2/ HOOK STDOUT: .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 HOOK STDOUT: .. _Polimorf dictionary: dicts/ HOOK STDOUT: .. _2-Clause BSD License: data/polimorf/LICENSE.txt HOOK STDOUT: .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: Alternatives HOOK STDOUT: ------------ HOOK STDOUT: HOOK STDOUT: * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. HOOK STDOUT: * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. HOOK STDOUT: * `polish-stem`_ is a Python stemmer using Finite State Transducers. HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: .. _Estem: https://github.com/arcusfelis/estem HOOK STDOUT: .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer HOOK STDOUT: .. _polish-stem: https://github.com/eugeniashurko/polish-stem HOOK STDOUT: HOOK STDOUT: HOOK STDOUT: running dist_info HOOK STDOUT: writing pystempel.egg-info/PKG-INFO HOOK STDOUT: writing dependency_links to pystempel.egg-info/dependency_links.txt HOOK STDOUT: writing requirements to pystempel.egg-info/requires.txt HOOK STDOUT: writing top-level names to pystempel.egg-info/top_level.txt HOOK STDOUT: reading manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: writing manifest file 'pystempel.egg-info/SOURCES.txt' HOOK STDOUT: creating '/builddir/build/BUILD/pystempel-1.2.0/pystempel.dist-info' Handling sortedcontainers from wheel metadata: Requires-Dist Requirement satisfied: sortedcontainers (installed: sortedcontainers 2.4.0) Handling tqdm from wheel metadata: Requires-Dist Requirement satisfied: tqdm (installed: tqdm 4.63.0) + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.EmBVGz + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + mkdir -p /builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 ' + TMPDIR=/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir + /usr/bin/python3 -m pip wheel --wheel-dir /builddir/build/BUILD/pystempel-1.2.0/pyproject-wheeldir --no-deps --use-pep517 --no-build-isolation --disable-pip-version-check --no-clean --progress-bar off --verbose . Processing /builddir/build/BUILD/pystempel-1.2.0 Preparing metadata (pyproject.toml): started Running command Preparing metadata (pyproject.toml) Stempel Stemmer =============== .. image:: https://badge.fury.io/py/pystempel.svg :target: https://badge.fury.io/py/pystempel Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. .. _Egothor Project: https://www.egothor.org/product/egothor2/ .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html This package includes also high-quality stemming tables for Polish: original one pretrained by Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets from Polimorf dictionary by me. The port does not include code for compiling stemming tables. .. _sjp.pl: https://sjp.pl/slownik/en/ How to use ---------- Install in your local environment: .. code:: console pip install pystempel Use in your code: .. code:: python >>> from stempel import StempelStemmer Choose original (called default) version of a stemmer: .. code:: python >>> stemmer = StempelStemmer.default() or a version with new stemming table pretrained on training sets from Polimorf dictionary: .. code:: python >>> stemmer = StempelStemmer.polimorf() Stem: .. code:: python >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: ... print(stemmer.stem(word)) ... książek książek książek książkowy książkowy Choosing stemming table ----------------------- Performance between original (default) and new stemming table (Polimorf-based) varies significantly. The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the same lemma provides different stems more often (63%) than when using Polimorf-based stemming table (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html Note also, that the licensing schema of both stemming tables differs, and hence licensing of data generated with each one. See "Licensing" section for the details. Choosing between port and wrapper --------------------------------- If you work on an NLP project in Python you can choose between Python port and Python wrapper. Python port is what pystempel tries to achieve: translation from Java implementation to Python. Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I compare both approaches to help you decide: * **Same accuracy**. I have verified Python port by comparing its output with output of original Java implementation for 331224 words from Free Polish dictionary (`sjp.pl`_) and for 100% of words it returns same output. * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. Python wrapper will make also `debugging harder`_ (switching between two programming languages). .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project .. _tests: tests/ Options ------- To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. Development setup ----------------- To setup environment for development you will need `Anaconda`_ installed. .. _Anaconda: https://anaconda.org/ .. code:: console conda env create --file environment.yml conda activate pystempel-env pre-commit install To run tests: .. code:: console curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar python -m pytest ./ To run benchmark: .. code:: console set PYTHONPATH=%PYTHONPATH%;%cd% python tests\test_benchmark.py Licensing --------- * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble of each file. * **Data**: * The original pretrained stemming table is covered by `Apache License 2.0`_. * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by `Apache License 2.0`_ as well. .. _Egothor: https://www.egothor.org/product/egothor2/ .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 .. _Polimorf dictionary: dicts/ .. _2-Clause BSD License: data/polimorf/LICENSE.txt .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ Alternatives ------------ * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. * `polish-stem`_ is a Python stemmer using Finite State Transducers. .. _Estem: https://github.com/arcusfelis/estem .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer .. _polish-stem: https://github.com/eugeniashurko/polish-stem running dist_info creating /builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info writing /builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/PKG-INFO writing dependency_links to /builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/dependency_links.txt writing requirements to /builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/requires.txt writing top-level names to /builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/top_level.txt writing manifest file '/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/SOURCES.txt' reading manifest file '/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/SOURCES.txt' writing manifest file '/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.egg-info/SOURCES.txt' creating '/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-modern-metadata-qm8ehskl/pystempel.dist-info' Preparing metadata (pyproject.toml): finished with status 'done' Building wheels for collected packages: pystempel Building wheel for pystempel (pyproject.toml): started Running command Building wheel for pystempel (pyproject.toml) Stempel Stemmer =============== .. image:: https://badge.fury.io/py/pystempel.svg :target: https://badge.fury.io/py/pystempel Python port of Stempel, an algorithmic stemmer for Polish language, originally written in Java. The original stemmer has been implemented as part of `Egothor Project`_, taken virtually unchanged to `Stempel Stemmer Java library`_ by Andrzej Białecki and next included as part of `Apache Lucene`_, a free and open-source search engine library. It is also used by `Elastic Search`_ search engine. .. _Egothor Project: https://www.egothor.org/product/egothor2/ .. _Stempel Stemmer Java library: http://www.getopt.org/stempel/index.html .. _Apache Lucene: https://lucene.apache.org/core/3_1_0/api/contrib-stempel/index.html .. _Elastic Search: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-stempel.html This package includes also high-quality stemming tables for Polish: original one pretrained by Andrzej Białecki on 20,000 training sets, and new one, pretrained on 259,080 training sets from Polimorf dictionary by me. The port does not include code for compiling stemming tables. .. _sjp.pl: https://sjp.pl/slownik/en/ How to use ---------- Install in your local environment: .. code:: console pip install pystempel Use in your code: .. code:: python >>> from stempel import StempelStemmer Choose original (called default) version of a stemmer: .. code:: python >>> stemmer = StempelStemmer.default() or a version with new stemming table pretrained on training sets from Polimorf dictionary: .. code:: python >>> stemmer = StempelStemmer.polimorf() Stem: .. code:: python >>> for word in ['książka', 'książki', 'książkami', 'książkowa', 'książkowymi']: ... print(stemmer.stem(word)) ... książek książek książek książkowy książkowy Choosing stemming table ----------------------- Performance between original (default) and new stemming table (Polimorf-based) varies significantly. The stemmer for the default stemming table is *understemming*, i.e., for multiple forms of the same lemma provides different stems more often (63%) than when using Polimorf-based stemming table (13%). However, the file footprint of the latter is bigger (2.2MB vs 0.3MB). Also loading takes longer (7.5 seconds vs. 1.3 seconds), though this happens only once, when a stemmer is created. Also, for original stemming table, the stemmer stems slightly faster: ~60000 vs ~51000 words per second. See `Evaluation Jupyter Notebook`_ for the detailed evaluation results. .. _Evaluation Jupyter Notebook: http://htmlpreview.github.io/?https://github.com/dzieciou/pystempel/blob/master/Evaluation.html Note also, that the licensing schema of both stemming tables differs, and hence licensing of data generated with each one. See "Licensing" section for the details. Choosing between port and wrapper --------------------------------- If you work on an NLP project in Python you can choose between Python port and Python wrapper. Python port is what pystempel tries to achieve: translation from Java implementation to Python. Python wrapper is what I used in `tests`_: Python functions to call the original Java implementation of stemmer. You can find more about wrappers and ports in `Stackoverflow comparision post`_. Here, I compare both approaches to help you decide: * **Same accuracy**. I have verified Python port by comparing its output with output of original Java implementation for 331224 words from Free Polish dictionary (`sjp.pl`_) and for 100% of words it returns same output. * **Similar performance**. For mentioned dataset both stemmer versions achieved comparable performance. Python port completed stemming in 4.4 seconds, while Python wrapper -- in 5 seconds (Intel Core i5-6000 3.30 GHz, 16GB RAM, Windows 10, OpenJDK) * **Different setup**. Python wrapper requires additionally installation of Cython and pyjnius. Python wrapper will make also `debugging harder`_ (switching between two programming languages). .. _Stackoverflow comparision post: https://stackoverflow.com/questions/10113218/how-to-decide-when-to-wrap-port-write-from-scratch .. _debugging harder: https://stackoverflow.com/questions/6970359/find-an-efficient-way-to-integrate-different-language-libraries-into-one-project .. _tests: tests/ Options ------- To disable a progress bar when loading stemming tables, set environment variable ``DISABLE_TQDM=True``. Development setup ----------------- To setup environment for development you will need `Anaconda`_ installed. .. _Anaconda: https://anaconda.org/ .. code:: console conda env create --file environment.yml conda activate pystempel-env pre-commit install To run tests: .. code:: console curl https://repo1.maven.org/maven2/org/apache/lucene/lucene-analyzers-stempel/8.1.1/lucene-analyzers-stempel-8.1.1.jar > stempel-8.1.1.jar python -m pytest ./ To run benchmark: .. code:: console set PYTHONPATH=%PYTHONPATH%;%cd% python tests\test_benchmark.py Licensing --------- * **Code**: Most of the code is covered by `Egothor`_ Open Source License, an Apache-style license. The rest of the code is covered by the `Apache License 2.0`_. This should be clear from a preamble of each file. * **Data**: * The original pretrained stemming table is covered by `Apache License 2.0`_. * The new pretrained stemming table is covered by `2-Clause BSD License`_, similarly to the `Polimorf dictionary` it has been derived from. The copyright owner of both the stemming table and the dictionary is `Institute of Computer Science at Polish Academy of Science`_ (IPI PAN). * Polish dictionary used by the unit tests comes from `sjp.pl`_ and is covered by `Apache License 2.0`_ as well. .. _Egothor: https://www.egothor.org/product/egothor2/ .. _Apache License 2.0: https://www.apache.org/licenses/LICENSE-2.0 .. _Polimorf dictionary: dicts/ .. _2-Clause BSD License: data/polimorf/LICENSE.txt .. _Institute of Computer Science at Polish Academy of Science: https://ipipan.waw.pl/en/ Alternatives ------------ * `Estem`_ is Erlang wrapper (not port) for Stempel stemmer. * `pl_stemmer`_ is a Python stemmer based on Porter's Algorithm. * `polish-stem`_ is a Python stemmer using Finite State Transducers. .. _Estem: https://github.com/arcusfelis/estem .. _pl_stemmer: https://github.com/Tutanchamon/pl_stemmer .. _polish-stem: https://github.com/eugeniashurko/polish-stem running bdist_wheel running build running build_py creating build creating build/lib creating build/lib/stempel copying stempel/streams.py -> build/lib/stempel copying stempel/egothor.py -> build/lib/stempel copying stempel/__init__.py -> build/lib/stempel installing to build/bdist.linux-x86_64/wheel running install running install_lib creating build/bdist.linux-x86_64 creating build/bdist.linux-x86_64/wheel creating build/bdist.linux-x86_64/wheel/stempel copying build/lib/stempel/__init__.py -> build/bdist.linux-x86_64/wheel/stempel copying build/lib/stempel/egothor.py -> build/bdist.linux-x86_64/wheel/stempel copying build/lib/stempel/streams.py -> build/bdist.linux-x86_64/wheel/stempel running install_egg_info running egg_info creating pystempel.egg-info writing pystempel.egg-info/PKG-INFO writing dependency_links to pystempel.egg-info/dependency_links.txt writing requirements to pystempel.egg-info/requires.txt writing top-level names to pystempel.egg-info/top_level.txt writing manifest file 'pystempel.egg-info/SOURCES.txt' reading manifest file 'pystempel.egg-info/SOURCES.txt' writing manifest file 'pystempel.egg-info/SOURCES.txt' Copying pystempel.egg-info to build/bdist.linux-x86_64/wheel/pystempel-1.2.0-py3.9.egg-info running install_scripts creating build/bdist.linux-x86_64/wheel/pystempel-1.2.0.dist-info/WHEEL creating '/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir/pip-wheel-t1mq4g0t/tmp430wqcb_/pystempel-1.2.0-py3-none-any.whl' and adding 'build/bdist.linux-x86_64/wheel' to it adding 'stempel/__init__.py' adding 'stempel/egothor.py' adding 'stempel/streams.py' adding 'pystempel-1.2.0.dist-info/METADATA' adding 'pystempel-1.2.0.dist-info/WHEEL' adding 'pystempel-1.2.0.dist-info/top_level.txt' adding 'pystempel-1.2.0.dist-info/RECORD' removing build/bdist.linux-x86_64/wheel Building wheel for pystempel (pyproject.toml): finished with status 'done' Created wheel for pystempel: filename=pystempel-1.2.0-py3-none-any.whl size=14829 sha256=b9252e993a85a528545d661eb9cc1acbd097d2e1f50e51813ae74ad2e30ce28e Stored in directory: /builddir/.cache/pip/wheels/e1/6a/ed/181597e97647e696fa06241b892069e9b4ba57a7bd0f783898 Successfully built pystempel + RPM_EC=0 ++ jobs -p + exit 0 Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.MKeypI + umask 022 + cd /builddir/build/BUILD + '[' /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 '!=' / ']' + rm -rf /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 ++ dirname /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 + mkdir -p /builddir/build/BUILDROOT + mkdir /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 + cd pystempel-1.2.0 ++ ls /builddir/build/BUILD/pystempel-1.2.0/pyproject-wheeldir/pystempel-1.2.0-py3-none-any.whl ++ sed -E 's/([^-]+)-([^-]+)-.+\.whl/\1==\2/' ++ xargs basename --multiple + specifier=pystempel==1.2.0 + TMPDIR=/builddir/build/BUILD/pystempel-1.2.0/.pyproject-builddir + /usr/bin/python3 -m pip install --root /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 --no-deps --disable-pip-version-check --progress-bar off --verbose --ignore-installed --no-warn-script-location --no-index --no-cache-dir --find-links /builddir/build/BUILD/pystempel-1.2.0/pyproject-wheeldir pystempel==1.2.0 Using pip 22.0.4 from /usr/lib/python3.9/site-packages/pip (python 3.9) Looking in links: /builddir/build/BUILD/pystempel-1.2.0/pyproject-wheeldir Processing ./pyproject-wheeldir/pystempel-1.2.0-py3-none-any.whl Installing collected packages: pystempel Successfully installed pystempel-1.2.0 + '[' -d /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/bin ']' + rm -f /builddir/build/BUILD/pyproject-ghost-distinfo + site_dirs=() + '[' -d /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages ']' + site_dirs+=("/usr/lib/python3.9/site-packages") + '[' /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib64/python3.9/site-packages '!=' /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages ']' + '[' -d /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib64/python3.9/site-packages ']' + for site_dir in ${site_dirs[@]} + for distinfo in /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64$site_dir/*.dist-info + echo '%ghost /usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info' + sed -i s/pip/rpm/ /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info/INSTALLER + PYTHONPATH=/usr/lib/rpm/redhat + /usr/bin/python3 -B /usr/lib/rpm/redhat/pyproject_preprocess_record.py --buildroot /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 --record /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info/RECORD --output /builddir/build/BUILD/pyproject-record + rm -fv /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info/RECORD removed '/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info/RECORD' + rm -fv /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info/REQUESTED removed '/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages/pystempel-1.2.0.dist-info/REQUESTED' ++ wc -l /builddir/build/BUILD/pyproject-ghost-distinfo ++ cut -f1 '-d ' + lines=1 + '[' 1 -ne 1 ']' + /usr/bin/python3 /usr/lib/rpm/redhat/pyproject_save_files.py --output-files /builddir/build/BUILD/pyproject-files --output-modules /builddir/build/BUILD/pyproject-modules --buildroot /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 --sitelib /usr/lib/python3.9/site-packages --sitearch /usr/lib64/python3.9/site-packages --python-version 3.9 --pyproject-record /builddir/build/BUILD/pyproject-record --prefix /usr '*' +auto + /usr/lib/rpm/find-debuginfo.sh -j2 --strict-build-id -m -i --build-id-seed 1.2.0-1.el9 --unique-debug-suffix -1.2.0-1.el9.x86_64 --unique-debug-src-base python-pystempel-1.2.0-1.el9.x86_64 --run-dwz --dwz-low-mem-die-limit 10000000 --dwz-max-die-limit 110000000 -S debugsourcefiles.list /builddir/build/BUILD/pystempel-1.2.0 find: 'debug': No such file or directory + /usr/lib/rpm/check-buildroot + /usr/lib/rpm/redhat/brp-ldconfig + /usr/lib/rpm/brp-compress + /usr/lib/rpm/redhat/brp-strip-lto /usr/bin/strip + /usr/lib/rpm/brp-strip-static-archive /usr/bin/strip + /usr/lib/rpm/redhat/brp-python-bytecompile '' 1 0 Bytecompiling .py files below /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9 using python3.9 + /usr/lib/rpm/brp-python-hardlink + /usr/lib/rpm/redhat/brp-mangle-shebangs Executing(%check): /bin/sh -e /var/tmp/rpm-tmp.NPPpcx + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + '[' '!' -f /builddir/build/BUILD/pyproject-modules ']' + PATH=/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/bin:/builddir/.local/bin:/builddir/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/sbin + PYTHONPATH=/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib64/python3.9/site-packages:/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages + _PYTHONSITE=/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib64/python3.9/site-packages:/builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64/usr/lib/python3.9/site-packages + PYTHONDONTWRITEBYTECODE=1 + /usr/bin/python3 -s /usr/lib/rpm/redhat/import_all_modules.py -f /builddir/build/BUILD/pyproject-modules -t Check import: stempel + RPM_EC=0 ++ jobs -p + exit 0 Processing files: python3-pystempel-1.2.0-1.el9.noarch Provides: python-pystempel = 1.2.0-1.el9 python3-pystempel = 1.2.0-1.el9 python3.9-pystempel = 1.2.0-1.el9 python3.9dist(pystempel) = 1.2 python3dist(pystempel) = 1.2 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PartialHardlinkSets) <= 4.0.4-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: python(abi) = 3.9 python3.9dist(sortedcontainers) python3.9dist(tqdm) Obsoletes: python39-pystempel < 1.2.0-1.el9 Checking for unpackaged file(s): /usr/lib/rpm/check-files /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 Wrote: /builddir/build/SRPMS/python-pystempel-1.2.0-1.el9.src.rpm Wrote: /builddir/build/RPMS/python3-pystempel-1.2.0-1.el9.noarch.rpm Executing(%clean): /bin/sh -e /var/tmp/rpm-tmp.Iiodgr + umask 022 + cd /builddir/build/BUILD + cd pystempel-1.2.0 + /usr/bin/rm -rf /builddir/build/BUILDROOT/python-pystempel-1.2.0-1.el9.x86_64 + RPM_EC=0 ++ jobs -p + exit 0 Child return code was: 0