From 190dc15982579e97306f15da377954a695aaab4f Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sat, 6 Jul 2019 09:49:40 +0100 Subject: [PATCH 1/9] Intial Documentation Commit --- README.md | 65 ----------------------------- README.rst | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ docs/Makefile | 20 +++++++++ docs/conf.py | 54 ++++++++++++++++++++++++ docs/index.rst | 10 +++++ docs/make.bat | 35 ++++++++++++++++ 6 files changed, 230 insertions(+), 65 deletions(-) delete mode 100644 README.md create mode 100644 README.rst create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat diff --git a/README.md b/README.md deleted file mode 100644 index feb283d..0000000 --- a/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Ìrànlọ́wọ́ -[![Build Status](https://travis-ci.org/Niger-Volta-LTI/iranlowo.svg?branch=master)](https://travis-ci.org/Niger-Volta-LTI/iranlowo) -[![PyPI](https://img.shields.io/pypi/v/iranlowo.svg)](https://pypi.org/project/iranlowo) -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/iranlowo.svg) -[![License](https://black.readthedocs.io/en/stable/_static/license.svg)](https://github.com/ruohoruotsi/iranlowo/blob/master/LICENSE) -[![Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) - -Ìrànlọ́wọ́ is a set of utilities to analyze & process Yorùbá text for NLP tasks. The focus is on *helping software developers* build large, clean text datasets for (further) diacritic restoration and machine translation tasks. 
- -## Features - -### ADR tools -* [X] Strip all diacritics from word-types -* [X] Verify that text is NFC or NFD -* [X] Canonicalize a corpus (from MS Word or elsewhere) → NFC -* [X] Split long sentences on certain characters like `;`,`:`, etc -* [X] Automatically restore correct diacritics using a pre-trained model -* [X] Find all variants of all word-type in a given corpus -* [ ] Partially strip diacritics from word-types - -### Ready to use webpage scrapers -* [X] Bíbélì Mímọ́ -* [X] Yoruba Bible - Bible Society of Nigeria -* [ ] Yorùbá Blog -* [ ] BBC Yorùbá - -### Corpus analysis tools -* [X] Dataset character distribution -* [X] Dataset ambuiguity statistics → Lexdif, etc for a given corpus -* [ ] Dataset scoring (proximity to correctly diacritized text, LM perplexity, KL divergence) - -## Installation -Obtainable from the [Python Package Index (PyPI)](https://pypi.org/project/iranlowo/) → `pip install iranlowo` - -## Example - -* Show computing environment and installation process - - - -* Diacritize a phrase -``` -$ python -Python 3.7.3 (default, Mar 27 2019, 16:54:48) -[Clang 4.0.1 (tags/RELEASE_401/final)] :: Anaconda, Inc. on darwin -Type "help", "copyright", "credits" or "license" for more information. ->>> import iranlowo.adr as ránlọ ->>> ránlọ.diacritize_text("lootoo ni pe ojo gbogbo ni ti ole") -PRED AVG SCORE: -0.0037, PRED PPL: 1.0037 -'lóòtóọ́ ni pé ọjọ́ gbogbo ni ti olè' -``` - -* Diacritize phrases, note we use `ipython` only because it renders nicer, easy-to-read text-colours in the terminal! - - - -## Disclaimer - -This is beta software, if you pass the diacritizer [out-of-domain text](https://www.quora.com/What-is-in-domain-out-domain-and-open-domain-data), English, pidgin or any other non-Yorùbá text, you will experience very marvelous, black-box results. 
- -Since this a work-in-progress and we are steadily improving, if you encounter any problems with correctness or performance, please submit [pull-requests](https://github.com/ruohoruotsi/iranlowo/pulls) with corrections or file an [issue](https://github.com/ruohoruotsi/iranlowo/issues). - -## License - -This project is licensed under the [MIT License](https://opensource.org/licenses/MIT). \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..a67b8c1 --- /dev/null +++ b/README.rst @@ -0,0 +1,111 @@ +.. role:: raw-html-m2r(raw) + :format: html + + +Ìrànlọ́wọ́ +======== + + +.. image:: https://travis-ci.org/Niger-Volta-LTI/iranlowo.svg?branch=master + :target: https://travis-ci.org/Niger-Volta-LTI/iranlowo + :alt: Build Status + + +.. image:: https://img.shields.io/pypi/v/iranlowo.svg + :target: https://pypi.org/project/iranlowo + :alt: PyPI + + +.. image:: https://img.shields.io/pypi/pyversions/iranlowo.svg + :target: https://img.shields.io/pypi/pyversions/iranlowo.svg + :alt: PyPI - Python Version + + +.. image:: https://black.readthedocs.io/en/stable/_static/license.svg + :target: https://github.com/ruohoruotsi/iranlowo/blob/master/LICENSE + :alt: License + + +.. image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/ambv/black + :alt: Style + + +Ìrànlọ́wọ́ is a set of utilities to analyze & process Yorùbá text for NLP tasks. The focus is on *helping software developers* build large, clean text datasets for (further) diacritic restoration and machine translation tasks. 
+ +Features +-------- + +ADR tools +^^^^^^^^^ + + +* [X] Strip all diacritics from word-types +* [X] Verify that text is NFC or NFD +* [X] Canonicalize a corpus (from MS Word or elsewhere) → NFC +* [X] Split long sentences on certain characters like ``;``\ ,\ ``:``\ , etc +* [X] Automatically restore correct diacritics using a pre-trained model +* [X] Find all variants of all word-types in a given corpus +* [ ] Partially strip diacritics from word-types + +Ready to use webpage scrapers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +* [X] Bíbélì Mímọ́ +* [X] Yoruba Bible - Bible Society of Nigeria +* [ ] Yorùbá Blog +* [ ] BBC Yorùbá + +Corpus analysis tools +^^^^^^^^^^^^^^^^^^^^^ + + +* [X] Dataset character distribution +* [X] Dataset ambiguity statistics → Lexdif, etc for a given corpus +* [ ] Dataset scoring (proximity to correctly diacritized text, LM perplexity, KL divergence) + +Installation +------------ + +Obtainable from the `Python Package Index (PyPI) <https://pypi.org/project/iranlowo/>`_ → ``pip install iranlowo`` + +Example +------- + + +* Show computing environment and installation process + +:raw-html-m2r:`` + + +* + Diacritize a phrase + + .. code-block:: + + $ python + Python 3.7.3 (default, Mar 27 2019, 16:54:48) + [Clang 4.0.1 (tags/RELEASE_401/final)] :: Anaconda, Inc. on darwin + Type "help", "copyright", "credits" or "license" for more information. + >>> import iranlowo.adr as ránlọ + >>> ránlọ.diacritize_text("lootoo ni pe ojo gbogbo ni ti ole") + PRED AVG SCORE: -0.0037, PRED PPL: 1.0037 + 'lóòtóọ́ ni pé ọjọ́ gbogbo ni ti olè' + +* + Diacritize phrases, note we use ``ipython`` only because it renders nicer, easy-to-read text-colours in the terminal! + +:raw-html-m2r:`` + +Disclaimer +---------- + +This is beta software; if you pass the diacritizer `out-of-domain text <https://www.quora.com/What-is-in-domain-out-domain-and-open-domain-data>`_\ , English, pidgin or any other non-Yorùbá text, you will experience very marvelous, black-box results. 
+ +Since this is a work in progress and we are steadily improving, if you encounter any problems with correctness or performance, please submit `pull-requests <https://github.com/ruohoruotsi/iranlowo/pulls>`_ with corrections or file an `issue <https://github.com/ruohoruotsi/iranlowo/issues>`_. + +License +------- + +This project is licensed under the `MIT License <https://opensource.org/licenses/MIT>`_. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..4f4c978 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,54 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Iranlowo' +copyright = '2019, Ruoho Ruosi , Olamilekan Wahab' +author = 'Ruoho Ruosi , Olamilekan Wahab' + +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['recommonmark'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..26780cb --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,10 @@ +.. Iranlowo documentation master file, created by + sphinx-quickstart on Sat Jul 6 09:15:49 2019. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. include:: ../README.rst + +.. 
toctree:: + :maxdepth: 1 + :caption: Contents: diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..922152e --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd From ab0c612ec4f0e8f2d4fddcb1166030d38b8c675b Mon Sep 17 00:00:00 2001 From: Olamyy Date: Fri, 9 Aug 2019 13:25:02 +0100 Subject: [PATCH 2/9] README.md --- src/iranlowo/interfaces.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/iranlowo/interfaces.py diff --git a/src/iranlowo/interfaces.py b/src/iranlowo/interfaces.py new file mode 100644 index 0000000..5ba518e --- /dev/null +++ b/src/iranlowo/interfaces.py @@ -0,0 +1,14 @@ +import scrapy + + +class Scrapper(scrapy.Spider): + """ + Interface for scrapping data from :mod:`iranlowo.scrapper` + """ + + def __init__(self, name, urls, **kwargs): + super(Scrapper, self).__init__(name, **kwargs) + + def parse(self, response): + pass + From 17b8e18a0ee89dadbb6a1f702c0d7569d4255f51 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Thu, 14 Nov 2019 10:33:15 +0100 Subject: [PATCH 3/9] Introducd ngram tokenizer --- src/iranlowo/tokenizer.py | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 
src/iranlowo/tokenizer.py diff --git a/src/iranlowo/tokenizer.py b/src/iranlowo/tokenizer.py new file mode 100644 index 0000000..d09560d --- /dev/null +++ b/src/iranlowo/tokenizer.py @@ -0,0 +1,84 @@ +import gensim + + +class Tokenizer(object): + def __init__(self, text, model=None, symbol=' ', func=None): + """ + + Args: + text: + model: + symbol: + func: + """ + self.text = text + self.symbol = symbol + self.func = func + self.model = model + + def ngram_tokenize(self): + pass + + def word_tokenize(self, symbol=None, map_entities=False): + if map_entities: + email, num, link, abb = "", "", "", "" + if not symbol: + tokens = gensim.utils.simple_tokenize(self.text) + else: + tokens = [x for x in self.text] + + def sentence_tokenize(self, min_words_to_split=10, min_words_in_utt=5): + output = [] + for line in self.text.splitlines(): + if self.symbol in line: + num_words = len(line.split()) + num_commas = line.count(self.symbol) + curr_comma_position = line.index(self.symbol) + num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split()) + + curr_line = line + while num_commas > 0: + if num_words < min_words_to_split: + # print(curr_line.strip()) + output.append(curr_line) + break + if num_words >= min_words_to_split: + if ( + num_words_ahead_of_curr_comma >= min_words_in_utt + and len(curr_line[curr_comma_position:].split()) + >= min_words_in_utt + ): + output.append(curr_line[0:curr_comma_position] + "\n") + + # update vars + curr_line = curr_line[curr_comma_position + 1:] + num_words = len(curr_line.split()) + num_commas = num_commas - 1 + if num_commas > 0: + curr_comma_position = curr_line.index(self.symbol) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + output.append(curr_line) + else: + # ignore too short comma (+= vs = on current comma position) + num_commas = num_commas - 1 + if num_commas > 0: # for say 3 commas + curr_comma_position += ( + curr_line[curr_comma_position + 1:].index(self.symbol) + 
+ 1 + ) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + output.append(curr_line) + else: + output.append(curr_line) + else: + output.append(line) + return output + + def morph_tokenize(self): + pass From 1feb123988a8afac3ac53c7acfb72df862c4bc18 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sun, 15 Dec 2019 20:31:25 +0100 Subject: [PATCH 4/9] Reworking corpus --- src/iranlowo/corpus/{scrapper.py => corpus.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/iranlowo/corpus/{scrapper.py => corpus.yml} (100%) diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/corpus.yml similarity index 100% rename from src/iranlowo/corpus/scrapper.py rename to src/iranlowo/corpus/corpus.yml From 5cc9ae583faffc2386c9799c82a9d4f378a701ce Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sun, 15 Dec 2019 22:58:21 +0100 Subject: [PATCH 5/9] Cleaned the corpus module. The loader is now a function. Added tests. --- requirements.txt | 1 + src/iranlowo/corpus/__init__.py | 4 +- src/iranlowo/corpus/corpus.py | 36 +++++++++++++++++ src/iranlowo/corpus/corpus.yml | 13 ++++++ src/iranlowo/corpus/loaders.py | 71 +++++---------------------------- tests/test_loaders.py | 16 +++++--- 6 files changed, 73 insertions(+), 68 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5441c7f..aa90338 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +PyYAML bs4 configargparse torch diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index 4201d4e..9af9807 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -1,2 +1,2 @@ -from .corpus import Corpus, DirectoryCorpus -from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus \ No newline at end of file +from .corpus import Corpus, DirectoryCorpus, get_corpus, get_corpus_path, download_corpus + diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index 
81fa2ff..01b99ca 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -1,6 +1,7 @@ import gzip import os +import yaml from gensim import interfaces from gensim.corpora.csvcorpus import CsvCorpus from gensim.corpora.textcorpus import walk @@ -130,3 +131,38 @@ def read_files(self): if self.depth <= depth: for path in filenames: yield os.path.join(dirpath, path) + + +def get_corpus(name, niger_volta=False, **kwargs): + def file_or_dir(path, mode): + if mode == "single": + return Corpus(path=path, **kwargs) + else: + return DirectoryCorpus(path=path, **kwargs) + + with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream: + data = yaml.safe_load(stream) + if niger_volta: + nvc = data.get('niger_volta') + if name not in nvc.keys(): + raise ValueError("Corpus {} does not exist".format(name)) + else: + path = os.path.join(os.environ['NIGER_VOLTA_CORPUS'], nvc[name]['path']) + return file_or_dir(path, nvc[name]['mode']) + else: + path = os.path.join(os.path.dirname(__file__), 'corpus/{}'.format(data['path'])) + return file_or_dir(path, data['mode']) + + +def get_corpus_path(name): + with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream: + data = yaml.safe_load(stream) + if name not in data.keys(): + raise ValueError("Corpus {} does not exist".format(name)) + else: + return os.path.join(os.path.dirname(__file__), data[name]) + + +def download_corpus(name, uri=None): + pass + diff --git a/src/iranlowo/corpus/corpus.yml b/src/iranlowo/corpus/corpus.yml index e69de29..93dc8d9 100644 --- a/src/iranlowo/corpus/corpus.yml +++ b/src/iranlowo/corpus/corpus.yml @@ -0,0 +1,13 @@ +niger_volta: + yoruba_blog: + path: "TheYorubaBlog/theyorubablog_dot_com.txt" + mode: single + owe_yoruba: + path: " " + mode: dir + quran_mimo: + path: " " + mode: dir + asubiaro: + path: " " + mode: single \ No newline at end of file diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py index 
1314af6..2d3e17d 100644 --- a/src/iranlowo/corpus/loaders.py +++ b/src/iranlowo/corpus/loaders.py @@ -1,66 +1,15 @@ import os -from iranlowo.corpus import Corpus, DirectoryCorpus +from iranlowo.corpus import get_corpus +os.environ['NIGER_VOLTA_CORPUS'] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text" -class BaseLoader(object): - def __init__(self, corpus_path): - self.corpus_path = corpus_path - yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None) - if not yoruba_text_path: - raise NotADirectoryError( - "YORUBA_TEXT_PATH environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to YORUBA_TEXT_PATH to it's " - "path") - else: - corpus_path = "{}/{}".format(yoruba_text_path, corpus_path) - self.path = corpus_path - - -class YorubaBlogCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs) - - -class BBCCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BBCCorpus, self).__init__(path=self.path, **kwargs) - super().__init__(path) - - -class BibeliCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BibeliCorpus, self).__init__(path=self.path, **kwargs) - - -class en(BaseLoader, DirectoryCorpus): - def __init__(self): - BaseLoader.__init__(self, corpus_path="Owe/en") - DirectoryCorpus.__init__(self, path=self.path) - - -class yo(BaseLoader, DirectoryCorpus): - def __init__(self): - BaseLoader.__init__(self, corpus_path="Owe/yo") - DirectoryCorpus.__init__(self, path=self.path) - - -class OweLoader(object): - def __init__(self): - self.en = en() - self.yo = yo() +def niger_volta_corpus(corpus_code): + nvc_path = os.environ.get("NIGER_VOLTA_CORPUS", None) + if not nvc_path: + raise NotADirectoryError( + "NIGER_VOLTA_CORPUS environment variable not found. 
Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's " + "path") + else: + return get_corpus(name=corpus_code, niger_volta=True) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 7468a35..91f0264 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -1,12 +1,18 @@ import unittest -from iranlowo import corpus +from iranlowo.corpus import loaders, corpus class TestCoprusLoader(unittest.TestCase): - def setUp(self): - self.owe_loader = corpus.OweLoader + def test_load_yoruba_blog(self): + yb = loaders.niger_volta_corpus('yoruba_blog') + self.assertIsInstance(yb, corpus.Corpus) - def test_load_owe(self): + def test_load_owe_empty(self): with self.assertRaises(NotADirectoryError): - self.owe_loader() + loaders.niger_volta_corpus('owe_yoruba') + + def test_load_corpus_does_not_exist(self): + with self.assertRaises(ValueError): + loaders.niger_volta_corpus('owe') + From 82608b69231277495a254ed16fd51f75ef75e463 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sun, 15 Dec 2019 23:05:05 +0100 Subject: [PATCH 6/9] Introduced black for formatting --- src/iranlowo/corpus/__init__.py | 9 ++++-- src/iranlowo/corpus/corpus.py | 57 +++++++++++++++++++++++---------- src/iranlowo/corpus/loaders.py | 7 ++-- 3 files changed, 52 insertions(+), 21 deletions(-) diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index 9af9807..16a9c96 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -1,2 +1,7 @@ -from .corpus import Corpus, DirectoryCorpus, get_corpus, get_corpus_path, download_corpus - +from .corpus import ( + Corpus, + DirectoryCorpus, + get_corpus, + get_corpus_path, + download_corpus, +) diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index 01b99ca..e85679c 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -11,7 +11,16 @@ class Corpus(interfaces.CorpusABC): - def 
__init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None): + def __init__( + self, + path=None, + text=None, + stream=False, + fformat="txt", + cformat=None, + labels=False, + preprocess=None, + ): """ Args: @@ -19,16 +28,23 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No text: """ self.path = path + print(self.path) self.text = text self.labels = labels self.stream = stream self.fformat = fformat self.cformat = cformat self.preprocess = preprocess - assert self.path or self.text, "You should pass either a path or text to read data from." + assert ( + self.path or self.text + ), "You should pass either a path or text to read data from." if not self.preprocess: self.preprocess = [normalize_diacritics_text] - self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text() + self.data = ( + self.read_file_filename_or_text(text=text) + if text + else self.read_file_filename_or_text() + ) self.validate_format() def __iter__(self): @@ -70,12 +86,14 @@ def read_file_filename_or_text(self, f=None, text=None): text = open(path) elif self.fformat == "csv": text = CsvCorpus(path, self.labels) - elif self.fformat == 'gzip': + elif self.fformat == "gzip": text = gzip.open(path) else: text = self.path.seek(0) - text = text.read() if not self.stream else ''.join(list(self.streamfile(text))) + text = ( + text.read() if not self.stream else "".join(list(self.streamfile(text))) + ) return self.handle_preprocessing(text) if self.preprocess else text def handle_preprocessing(self, text): @@ -94,12 +112,16 @@ def validate_format(self): """ data = self.data if isinstance(data, list): - data = ''.join(data) + data = "".join(data) if not self.cformat and not is_text_nfc(data): raise TypeError("The corpus does not comply to the NFC corpus format") elif self.cformat == "owe": if not is_valid_owé_format(data): - raise TypeError("The corpus does not comply to the {0} 
corpus format".format(self.cformat)) + raise TypeError( + "The corpus does not comply to the {0} corpus format".format( + self.cformat + ) + ) else: return True @@ -119,14 +141,16 @@ def generate(self, size): class DirectoryCorpus(Corpus): def __init__(self, path, **kwargs): self.dir_path = path - self.depth = kwargs.get('min_depth', 0) + self.depth = kwargs.get("min_depth", 0) self.path = list(self.read_files()) super(DirectoryCorpus, self).__init__(path=self.path, **kwargs) def read_files(self): walked = list(walk(self.dir_path)) if not walked: - raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path)) + raise NotADirectoryError( + "'{}' is not a valid directory".format(self.dir_path) + ) for depth, dirpath, _, filenames in walked: if self.depth <= depth: for path in filenames: @@ -140,22 +164,22 @@ def file_or_dir(path, mode): else: return DirectoryCorpus(path=path, **kwargs) - with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream: + with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream: data = yaml.safe_load(stream) if niger_volta: - nvc = data.get('niger_volta') + nvc = data.get("niger_volta") if name not in nvc.keys(): raise ValueError("Corpus {} does not exist".format(name)) else: - path = os.path.join(os.environ['NIGER_VOLTA_CORPUS'], nvc[name]['path']) - return file_or_dir(path, nvc[name]['mode']) + path = os.path.join(os.environ["NIGER_VOLTA_CORPUS"], nvc[name]["path"]) + return file_or_dir(path, nvc[name]["mode"]) else: - path = os.path.join(os.path.dirname(__file__), 'corpus/{}'.format(data['path'])) - return file_or_dir(path, data['mode']) + path = os.path.join(os.path.dirname(__file__), "corpus/{}".format(data["path"])) + return file_or_dir(path, data["mode"]) def get_corpus_path(name): - with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream: + with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream: data = 
yaml.safe_load(stream) if name not in data.keys(): raise ValueError("Corpus {} does not exist".format(name)) @@ -165,4 +189,3 @@ def get_corpus_path(name): def download_corpus(name, uri=None): pass - diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py index 2d3e17d..1565ec1 100644 --- a/src/iranlowo/corpus/loaders.py +++ b/src/iranlowo/corpus/loaders.py @@ -2,7 +2,9 @@ from iranlowo.corpus import get_corpus -os.environ['NIGER_VOLTA_CORPUS'] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text" +os.environ[ + "NIGER_VOLTA_CORPUS" +] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text" def niger_volta_corpus(corpus_code): @@ -10,6 +12,7 @@ def niger_volta_corpus(corpus_code): if not nvc_path: raise NotADirectoryError( "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's " - "path") + "path" + ) else: return get_corpus(name=corpus_code, niger_volta=True) From e3cace6ba4965917ace0b46e51110ccf40f3d7f5 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sun, 15 Dec 2019 23:14:16 +0100 Subject: [PATCH 7/9] Removed print statements --- src/iranlowo/corpus/corpus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index e85679c..0bdf3bf 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -28,7 +28,6 @@ def __init__( text: """ self.path = path - print(self.path) self.text = text self.labels = labels self.stream = stream From 5ee7a63d3324002f542e7921eb1fe7defcbe0748 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sun, 15 Dec 2019 23:32:04 +0100 Subject: [PATCH 8/9] Added NIGER_VOLTA_CORPUS folder check. 
--- src/iranlowo/corpus/corpus.py | 5 +++++ src/iranlowo/corpus/loaders.py | 9 +-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index 0bdf3bf..c4610e2 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -170,6 +170,11 @@ def file_or_dir(path, mode): if name not in nvc.keys(): raise ValueError("Corpus {} does not exist".format(name)) else: + if not os.environ.get("NIGER_VOLTA_CORPUS", None): + raise NotADirectoryError( + "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's " + "path" + ) path = os.path.join(os.environ["NIGER_VOLTA_CORPUS"], nvc[name]["path"]) return file_or_dir(path, nvc[name]["mode"]) else: diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py index 1565ec1..07e60a8 100644 --- a/src/iranlowo/corpus/loaders.py +++ b/src/iranlowo/corpus/loaders.py @@ -8,11 +8,4 @@ def niger_volta_corpus(corpus_code): - nvc_path = os.environ.get("NIGER_VOLTA_CORPUS", None) - if not nvc_path: - raise NotADirectoryError( - "NIGER_VOLTA_CORPUS environment variable not found. 
Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's " - "path" - ) - else: - return get_corpus(name=corpus_code, niger_volta=True) + return get_corpus(name=corpus_code, niger_volta=True) From 7db0ec49d6d7728ae1fad5ddc5102c934ed336b9 Mon Sep 17 00:00:00 2001 From: Olamyy Date: Sun, 15 Dec 2019 23:50:38 +0100 Subject: [PATCH 9/9] Fixed failing tests --- src/iranlowo/corpus/loaders.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py index 07e60a8..e82a9ed 100644 --- a/src/iranlowo/corpus/loaders.py +++ b/src/iranlowo/corpus/loaders.py @@ -2,10 +2,6 @@ from iranlowo.corpus import get_corpus -os.environ[ - "NIGER_VOLTA_CORPUS" -] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text" - def niger_volta_corpus(corpus_code): return get_corpus(name=corpus_code, niger_volta=True)