diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..4f4c978 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,54 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Iranlowo' +copyright = '2019, Ruoho Ruosi , Olamilekan Wahab' +author = 'Ruoho Ruosi , Olamilekan Wahab' + +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['recommonmark'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..26780cb --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,10 @@ +.. Iranlowo documentation master file, created by + sphinx-quickstart on Sat Jul 6 09:15:49 2019. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. include:: ../README.rst + +.. toctree:: + :maxdepth: 1 + :caption: Contents: diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..922152e --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/requirements.txt b/requirements.txt index 5441c7f..aa90338 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +PyYAML bs4 configargparse torch diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py index 4201d4e..16a9c96 100644 --- a/src/iranlowo/corpus/__init__.py +++ b/src/iranlowo/corpus/__init__.py @@ -1,2 +1,7 @@ -from .corpus import Corpus, DirectoryCorpus -from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus \ No newline at end of file +from .corpus import ( + Corpus, + DirectoryCorpus, + get_corpus, + get_corpus_path, + download_corpus, +) diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py index 81fa2ff..c4610e2 100644 --- a/src/iranlowo/corpus/corpus.py +++ b/src/iranlowo/corpus/corpus.py @@ -1,6 +1,7 @@ import gzip import os +import yaml from gensim import interfaces from gensim.corpora.csvcorpus import CsvCorpus from gensim.corpora.textcorpus import walk @@ -10,7 +11,16 @@ class Corpus(interfaces.CorpusABC): - def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None): + def __init__( + self, + path=None, + text=None, + stream=False, + fformat="txt", + cformat=None, + labels=False, + preprocess=None, + ): """ Args: @@ -24,10 +34,16 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No self.fformat = fformat self.cformat = cformat self.preprocess = preprocess - assert self.path or self.text, "You should pass either a path or text to read data from." + assert ( + self.path or self.text + ), "You should pass either a path or text to read data from." if not self.preprocess: self.preprocess = [normalize_diacritics_text] - self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text() + self.data = ( + self.read_file_filename_or_text(text=text) + if text + else self.read_file_filename_or_text() + ) self.validate_format() def __iter__(self): @@ -69,12 +85,14 @@ def read_file_filename_or_text(self, f=None, text=None): text = open(path) elif self.fformat == "csv": text = CsvCorpus(path, self.labels) - elif self.fformat == 'gzip': + elif self.fformat == "gzip": text = gzip.open(path) else: text = self.path.seek(0) - text = text.read() if not self.stream else ''.join(list(self.streamfile(text))) + text = ( + text.read() if not self.stream else "".join(list(self.streamfile(text))) + ) return self.handle_preprocessing(text) if self.preprocess else text def handle_preprocessing(self, text): @@ -93,12 +111,16 @@ def validate_format(self): """ data = self.data if isinstance(data, list): - data = ''.join(data) + data = "".join(data) if not self.cformat and not is_text_nfc(data): raise TypeError("The corpus does not comply to the NFC corpus format") elif self.cformat == "owe": if not is_valid_owé_format(data): - raise TypeError("The corpus does not comply to the {0} corpus format".format(self.cformat)) + raise TypeError( + "The corpus does not comply to the {0} corpus format".format( + self.cformat + ) + ) else: return True @@ -118,15 +140,56 @@ def generate(self, size): class DirectoryCorpus(Corpus): def __init__(self, path, **kwargs): self.dir_path = path - self.depth = kwargs.get('min_depth', 0) + self.depth = kwargs.get("min_depth", 0) self.path = list(self.read_files()) super(DirectoryCorpus, self).__init__(path=self.path, **kwargs) def read_files(self): walked = list(walk(self.dir_path)) if not walked: - raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path)) + raise NotADirectoryError( + "'{}' is not a valid directory".format(self.dir_path) + ) for depth, dirpath, _, filenames in walked: if self.depth <= depth: for path in filenames: yield os.path.join(dirpath, path) + + +def get_corpus(name, niger_volta=False, **kwargs): + def file_or_dir(path, mode): + if mode == "single": + return Corpus(path=path, **kwargs) + else: + return DirectoryCorpus(path=path, **kwargs) + + with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream: + data = yaml.safe_load(stream) + if niger_volta: + nvc = data.get("niger_volta") + if name not in nvc.keys(): + raise ValueError("Corpus {} does not exist".format(name)) + else: + if not os.environ.get("NIGER_VOLTA_CORPUS", None): + raise NotADirectoryError( + "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's " + "path" + ) + path = os.path.join(os.environ["NIGER_VOLTA_CORPUS"], nvc[name]["path"]) + return file_or_dir(path, nvc[name]["mode"]) + else: + path = os.path.join(os.path.dirname(__file__), "corpus/{}".format(data["path"])) + return file_or_dir(path, data["mode"]) + + +def get_corpus_path(name): + with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream: + data = yaml.safe_load(stream) + if name not in data.keys(): + raise ValueError("Corpus {} does not exist".format(name)) + else: + return os.path.join(os.path.dirname(__file__), data[name]) + + +def download_corpus(name, uri=None): + pass diff --git a/src/iranlowo/corpus/corpus.yml b/src/iranlowo/corpus/corpus.yml new file mode 100644 index 0000000..93dc8d9 --- /dev/null +++ b/src/iranlowo/corpus/corpus.yml @@ -0,0 +1,13 @@ +niger_volta: + yoruba_blog: + path: "TheYorubaBlog/theyorubablog_dot_com.txt" + mode: single + owe_yoruba: + path: " " + mode: dir + quran_mimo: + path: " " + mode: dir + asubiaro: + path: " " + mode: single \ No newline at end of file diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py index 1314af6..e82a9ed 100644 --- a/src/iranlowo/corpus/loaders.py +++ b/src/iranlowo/corpus/loaders.py @@ -1,66 +1,7 @@ import os -from iranlowo.corpus import Corpus, DirectoryCorpus +from iranlowo.corpus import get_corpus -class BaseLoader(object): - def __init__(self, corpus_path): - self.corpus_path = corpus_path - yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None) - if not yoruba_text_path: - raise NotADirectoryError( - "YORUBA_TEXT_PATH environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to YORUBA_TEXT_PATH to it's " - "path") - else: - corpus_path = "{}/{}".format(yoruba_text_path, corpus_path) - self.path = corpus_path - - -class YorubaBlogCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs) - - -class BBCCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BBCCorpus, self).__init__(path=self.path, **kwargs) - super().__init__(path) - - -class BibeliCorpus(Corpus): - def __init__(self, path): - """ - - Args: - path: - """ - super(BibeliCorpus, self).__init__(path=self.path, **kwargs) - - -class en(BaseLoader, DirectoryCorpus): - def __init__(self): - BaseLoader.__init__(self, corpus_path="Owe/en") - DirectoryCorpus.__init__(self, path=self.path) - - -class yo(BaseLoader, DirectoryCorpus): - def __init__(self): - BaseLoader.__init__(self, corpus_path="Owe/yo") - DirectoryCorpus.__init__(self, path=self.path) - - -class OweLoader(object): - def __init__(self): - self.en = en() - self.yo = yo() - +def niger_volta_corpus(corpus_code): + return get_corpus(name=corpus_code, niger_volta=True) diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/scrapper.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/iranlowo/tokenizer.py b/src/iranlowo/tokenizer.py new file mode 100644 index 0000000..d09560d --- /dev/null +++ b/src/iranlowo/tokenizer.py @@ -0,0 +1,84 @@ +import gensim + + +class Tokenizer(object): + def __init__(self, text, model=None, symbol=' ', func=None): + """ + + Args: + text: + model: + symbol: + func: + """ + self.text = text + self.symbol = symbol + self.func = func + self.model = model + + def ngram_tokenize(self): + pass + + def word_tokenize(self, symbol=None, map_entities=False): + if map_entities: + email, num, link, abb = "", "", "", "" + if not symbol: + tokens = gensim.utils.simple_tokenize(self.text) + else: + tokens = [x for x in self.text] + + def sentence_tokenize(self, min_words_to_split=10, min_words_in_utt=5): + output = [] + for line in self.text.splitlines(): + if self.symbol in line: + num_words = len(line.split()) + num_commas = line.count(self.symbol) + curr_comma_position = line.index(self.symbol) + num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split()) + + curr_line = line + while num_commas > 0: + if num_words < min_words_to_split: + # print(curr_line.strip()) + output.append(curr_line) + break + if num_words >= min_words_to_split: + if ( + num_words_ahead_of_curr_comma >= min_words_in_utt + and len(curr_line[curr_comma_position:].split()) + >= min_words_in_utt + ): + output.append(curr_line[0:curr_comma_position] + "\n") + + # update vars + curr_line = curr_line[curr_comma_position + 1:] + num_words = len(curr_line.split()) + num_commas = num_commas - 1 + if num_commas > 0: + curr_comma_position = curr_line.index(self.symbol) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + output.append(curr_line) + else: + # ignore too short comma (+= vs = on current comma position) + num_commas = num_commas - 1 + if num_commas > 0: # for say 3 commas + curr_comma_position += ( + curr_line[curr_comma_position + 1:].index(self.symbol) + + 1 + ) + num_words_ahead_of_curr_comma = len( + curr_line[0:curr_comma_position].split() + ) + else: + output.append(curr_line) + else: + output.append(curr_line) + else: + output.append(line) + return output + + def morph_tokenize(self): + pass diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 7468a35..91f0264 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -1,12 +1,18 @@ import unittest -from iranlowo import corpus +from iranlowo.corpus import loaders, corpus class TestCoprusLoader(unittest.TestCase): - def setUp(self): - self.owe_loader = corpus.OweLoader + def test_load_yoruba_blog(self): + yb = loaders.niger_volta_corpus('yoruba_blog') + self.assertIsInstance(yb, corpus.Corpus) - def test_load_owe(self): + def test_load_owe_empty(self): with self.assertRaises(NotADirectoryError): - self.owe_loader() + loaders.niger_volta_corpus('owe_yoruba') + + def test_load_corpus_does_not_exist(self): + with self.assertRaises(ValueError): + loaders.niger_volta_corpus('owe') +