From 190dc15982579e97306f15da377954a695aaab4f Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sat, 6 Jul 2019 09:49:40 +0100
Subject: [PATCH 1/9] Initial Documentation Commit

---
 README.md      |  65 -----------------------------
 README.rst     | 111 +++++++++++++++++++++++++++++++++++++++++++++++++
 docs/Makefile  |  20 +++++++++
 docs/conf.py   |  54 ++++++++++++++++++++++++
 docs/index.rst |  10 +++++
 docs/make.bat  |  35 ++++++++++++++++
 6 files changed, 230 insertions(+), 65 deletions(-)
 delete mode 100644 README.md
 create mode 100644 README.rst
 create mode 100644 docs/Makefile
 create mode 100644 docs/conf.py
 create mode 100644 docs/index.rst
 create mode 100644 docs/make.bat
diff --git a/README.md b/README.md
deleted file mode 100644
index feb283d..0000000
--- a/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Ìrànlọ́wọ́
-[![Build Status](https://travis-ci.org/Niger-Volta-LTI/iranlowo.svg?branch=master)](https://travis-ci.org/Niger-Volta-LTI/iranlowo)
-[![PyPI](https://img.shields.io/pypi/v/iranlowo.svg)](https://pypi.org/project/iranlowo)
-![PyPI - Python Version](https://img.shields.io/pypi/pyversions/iranlowo.svg)
-[![License](https://black.readthedocs.io/en/stable/_static/license.svg)](https://github.com/ruohoruotsi/iranlowo/blob/master/LICENSE)
-[![Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)
-
-Ìrànlọ́wọ́ is a set of utilities to analyze &amp; process Yorùbá text for NLP tasks. The focus is on *helping software developers* build large, clean text datasets for (further) diacritic restoration and machine translation tasks.
-
-## Features
-
-### ADR tools
-* [X] Strip all diacritics from word-types
-* [X] Verify that text is NFC or NFD
-* [X] Canonicalize a corpus (from MS Word or elsewhere) &rarr; NFC
-* [X] Split long sentences on certain characters like `;`,`:`, etc
-* [X] Automatically restore correct diacritics using a pre-trained model
-* [X] Find all variants of all word-type in a given corpus
-* [ ] Partially strip diacritics from word-types
-
-### Ready to use webpage scrapers
-* [X] Bíbélì Mímọ́
-* [X] Yoruba Bible - Bible Society of Nigeria
-* [ ] Yorùbá Blog
-* [ ] BBC Yorùbá
-
-### Corpus analysis tools
-* [X] Dataset character distribution
-* [X] Dataset ambuiguity statistics &rarr; Lexdif, etc for a given corpus
-* [ ] Dataset scoring (proximity to correctly diacritized text, LM perplexity, KL divergence)
-
-## Installation
-Obtainable from the [Python Package Index (PyPI)](https://pypi.org/project/iranlowo/) &rarr;  `pip install iranlowo`
-
-## Example
-
-* Show computing environment and installation process
-
-<img src="https://raw.githubusercontent.com/ruohoruotsi/iranlowo/master/docs/install.gif" width="700">
-
-* Diacritize a phrase
-```
-$ python
-Python 3.7.3 (default, Mar 27 2019, 16:54:48)
-[Clang 4.0.1 (tags/RELEASE_401/final)] :: Anaconda, Inc. on darwin
-Type "help", "copyright", "credits" or "license" for more information.
->>> import iranlowo.adr as ránlọ
->>> ránlọ.diacritize_text("lootoo ni pe ojo gbogbo ni ti ole")
-PRED AVG SCORE: -0.0037, PRED PPL: 1.0037
-'lóòtóọ́ ni pé ọjọ́ gbogbo ni ti olè' 
-```
-
-* Diacritize phrases, note we use `ipython` only because it renders nicer, easy-to-read text-colours in the terminal!
-
-<img src="https://raw.githubusercontent.com/ruohoruotsi/iranlowo/master/docs/adr.gif" width="700">
-
-## Disclaimer
-
-This is beta software, if you pass the diacritizer [out-of-domain text](https://www.quora.com/What-is-in-domain-out-domain-and-open-domain-data), English, pidgin or any other non-Yorùbá text, you will experience very marvelous, black-box results. 
-
-Since this a work-in-progress and we are steadily improving, if you encounter any problems with correctness or performance, please submit [pull-requests](https://github.com/ruohoruotsi/iranlowo/pulls) with corrections or file an [issue](https://github.com/ruohoruotsi/iranlowo/issues).
-
-## License
- 
-This project is licensed under the [MIT License](https://opensource.org/licenses/MIT).
\ No newline at end of file
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..a67b8c1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,111 @@
+.. role:: raw-html-m2r(raw)
+   :format: html
+
+
+Ìrànlọ́wọ́
+========
+
+
+.. image:: https://travis-ci.org/Niger-Volta-LTI/iranlowo.svg?branch=master
+   :target: https://travis-ci.org/Niger-Volta-LTI/iranlowo
+   :alt: Build Status
+
+
+.. image:: https://img.shields.io/pypi/v/iranlowo.svg
+   :target: https://pypi.org/project/iranlowo
+   :alt: PyPI
+
+
+.. image:: https://img.shields.io/pypi/pyversions/iranlowo.svg
+   :target: https://img.shields.io/pypi/pyversions/iranlowo.svg
+   :alt: PyPI - Python Version
+
+
+.. image:: https://black.readthedocs.io/en/stable/_static/license.svg
+   :target: https://github.com/ruohoruotsi/iranlowo/blob/master/LICENSE
+   :alt: License
+
+
+.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
+   :target: https://github.com/ambv/black
+   :alt: Style
+
+
+Ìrànlọ́wọ́ is a set of utilities to analyze & process Yorùbá text for NLP tasks. The focus is on *helping software developers* build large, clean text datasets for (further) diacritic restoration and machine translation tasks.
+
+Features
+--------
+
+ADR tools
+^^^^^^^^^
+
+
+* [X] Strip all diacritics from word-types
+* [X] Verify that text is NFC or NFD
+* [X] Canonicalize a corpus (from MS Word or elsewhere) → NFC
+* [X] Split long sentences on certain characters like ``;``\ ,\ ``:``\ , etc
+* [X] Automatically restore correct diacritics using a pre-trained model
+* [X] Find all variants of all word-type in a given corpus
+* [ ] Partially strip diacritics from word-types
+
+Ready to use webpage scrapers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+* [X] Bíbélì Mímọ́
+* [X] Yoruba Bible - Bible Society of Nigeria
+* [ ] Yorùbá Blog
+* [ ] BBC Yorùbá
+
+Corpus analysis tools
+^^^^^^^^^^^^^^^^^^^^^
+
+
+* [X] Dataset character distribution
+* [X] Dataset ambiguity statistics → Lexdif, etc for a given corpus
+* [ ] Dataset scoring (proximity to correctly diacritized text, LM perplexity, KL divergence)
+
+Installation
+------------
+
+Obtainable from the `Python Package Index (PyPI) <https://pypi.org/project/iranlowo/>`_ →  ``pip install iranlowo``
+
+Example
+-------
+
+
+* Show computing environment and installation process
+
+:raw-html-m2r:`<img src="https://raw.githubusercontent.com/ruohoruotsi/iranlowo/master/docs/install.gif" width="700">`
+
+
+* 
+  Diacritize a phrase
+
+  .. code-block::
+
+     $ python
+     Python 3.7.3 (default, Mar 27 2019, 16:54:48)
+     [Clang 4.0.1 (tags/RELEASE_401/final)] :: Anaconda, Inc. on darwin
+     Type "help", "copyright", "credits" or "license" for more information.
+     >>> import iranlowo.adr as ránlọ
+     >>> ránlọ.diacritize_text("lootoo ni pe ojo gbogbo ni ti ole")
+     PRED AVG SCORE: -0.0037, PRED PPL: 1.0037
+     'lóòtóọ́ ni pé ọjọ́ gbogbo ni ti olè'
+
+* 
+  Diacritize phrases, note we use ``ipython`` only because it renders nicer, easy-to-read text-colours in the terminal!
+
+:raw-html-m2r:`<img src="https://raw.githubusercontent.com/ruohoruotsi/iranlowo/master/docs/adr.gif" width="700">`
+
+Disclaimer
+----------
+
+This is beta software. If you pass the diacritizer `out-of-domain text <https://www.quora.com/What-is-in-domain-out-domain-and-open-domain-data>`_\ , English, pidgin or any other non-Yorùbá text, you will experience very marvelous, black-box results.
+
+Since this is a work-in-progress and we are steadily improving, if you encounter any problems with correctness or performance, please submit `pull-requests <https://github.com/ruohoruotsi/iranlowo/pulls>`_ with corrections or file an `issue <https://github.com/ruohoruotsi/iranlowo/issues>`_.
+
+License
+-------
+
+This project is licensed under the `MIT License <https://opensource.org/licenses/MIT>`_.
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..4f4c978
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,54 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'Iranlowo'
+copyright = '2019, Ruoho Ruotsi, Olamilekan Wahab'  # author list, kept identical to `author` below
+author = 'Ruoho Ruotsi, Olamilekan Wahab'
+
+# The full version, including alpha/beta/rc tags
+release = '0.1'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['recommonmark']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..26780cb
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,10 @@
+.. Iranlowo documentation master file, created by
+   sphinx-quickstart on Sat Jul  6 09:15:49 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+.. include:: ../README.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..922152e
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd

From ab0c612ec4f0e8f2d4fddcb1166030d38b8c675b Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Fri, 9 Aug 2019 13:25:02 +0100
Subject: [PATCH 2/9] README.md

---
 src/iranlowo/interfaces.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 src/iranlowo/interfaces.py

diff --git a/src/iranlowo/interfaces.py b/src/iranlowo/interfaces.py
new file mode 100644
index 0000000..5ba518e
--- /dev/null
+++ b/src/iranlowo/interfaces.py
@@ -0,0 +1,14 @@
+import scrapy
+
+
+class Scrapper(scrapy.Spider):
+    """
+    Interface for scraping data from :mod:`iranlowo.scrapper`
+    """
+
+    def __init__(self, name, urls, **kwargs):
+        super(Scrapper, self).__init__(name, **kwargs)  # NOTE(review): `urls` is accepted but never stored or used - confirm intent
+
+    def parse(self, response):
+        pass  # no parsing is implemented in this class
+

From 17b8e18a0ee89dadbb6a1f702c0d7569d4255f51 Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Thu, 14 Nov 2019 10:33:15 +0100
Subject: [PATCH 3/9] Introduced ngram tokenizer

---
 src/iranlowo/tokenizer.py | 84 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 src/iranlowo/tokenizer.py

diff --git a/src/iranlowo/tokenizer.py b/src/iranlowo/tokenizer.py
new file mode 100644
index 0000000..d09560d
--- /dev/null
+++ b/src/iranlowo/tokenizer.py
@@ -0,0 +1,84 @@
+import gensim
+
+
+class Tokenizer(object):
+    def __init__(self, text, model=None, symbol=' ', func=None):
+        """Store the text to tokenize together with the tokenizer settings.
+
+        Args:
+            text: String to tokenize; stored as ``self.text``.
+            model: Stored as ``self.model``; not used by any method defined in this class.
+            symbol: Delimiter that :meth:`sentence_tokenize` splits lines on (default: one space).
+            func: Stored as ``self.func``; not used by any method defined in this class.
+        """
+        self.text = text
+        self.symbol = symbol
+        self.func = func
+        self.model = model
+
+    def ngram_tokenize(self):
+        pass  # placeholder: not implemented yet
+
+    def word_tokenize(self, symbol=None, map_entities=False):
+        """Return the word tokens of ``self.text`` as a list; ``map_entities`` is accepted but not implemented yet."""
+        if not symbol:
+            tokens = list(gensim.utils.simple_tokenize(self.text))
+        else:
+            tokens = list(self.text)  # NOTE(review): one token per character; `symbol` is not used to split - confirm intent
+        return tokens
+
+    def sentence_tokenize(self, min_words_to_split=10, min_words_in_utt=5):
+        """Split lines of ``self.text`` at ``self.symbol`` where both sides keep enough words; returns a list of str."""
+        output = []
+        for line in self.text.splitlines():
+            if self.symbol in line:
+                num_words = len(line.split())
+                num_commas = line.count(self.symbol)
+                curr_comma_position = line.index(self.symbol)
+                num_words_ahead_of_curr_comma = len(line[0:curr_comma_position].split())
+
+                curr_line = line
+                while num_commas > 0:
+                    if num_words < min_words_to_split:
+                        output.append(curr_line)  # too few words left: keep the remainder whole
+                        break
+                    if num_words >= min_words_to_split:
+                        if (
+                                num_words_ahead_of_curr_comma >= min_words_in_utt
+                                and len(curr_line[curr_comma_position:].split())
+                                >= min_words_in_utt
+                        ):
+                            output.append(curr_line[0:curr_comma_position] + "\n")
+
+                            # carry on with the text that follows the delimiter just split on
+                            curr_line = curr_line[curr_comma_position + 1:]
+                            num_words = len(curr_line.split())
+                            num_commas = num_commas - 1
+                            if num_commas > 0:
+                                curr_comma_position = curr_line.index(self.symbol)
+                                num_words_ahead_of_curr_comma = len(
+                                    curr_line[0:curr_comma_position].split()
+                                )
+                            else:
+                                output.append(curr_line)
+                        else:
+                            # a split here would leave a piece that is too short: skip to the next delimiter (+= because the index below is relative to the slice)
+                            num_commas = num_commas - 1
+                            if num_commas > 0:  # for say 3 commas
+                                curr_comma_position += (
+                                        curr_line[curr_comma_position + 1:].index(self.symbol)
+                                        + 1
+                                )
+                                num_words_ahead_of_curr_comma = len(
+                                    curr_line[0:curr_comma_position].split()
+                                )
+                            else:
+                                output.append(curr_line)
+                    else:  # never taken: num_words >= min_words_to_split always holds here
+                        output.append(curr_line)
+            else:
+                output.append(line)
+        return output
+
+    def morph_tokenize(self):
+        pass  # placeholder: not implemented yet

From 1feb123988a8afac3ac53c7acfb72df862c4bc18 Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sun, 15 Dec 2019 20:31:25 +0100
Subject: [PATCH 4/9] Reworking corpus

---
 src/iranlowo/corpus/{scrapper.py => corpus.yml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/iranlowo/corpus/{scrapper.py => corpus.yml} (100%)

diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/corpus.yml
similarity index 100%
rename from src/iranlowo/corpus/scrapper.py
rename to src/iranlowo/corpus/corpus.yml

From 5cc9ae583faffc2386c9799c82a9d4f378a701ce Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sun, 15 Dec 2019 22:58:21 +0100
Subject: [PATCH 5/9] Cleaned the corpus module. The loader is now a function.
 Added tests.

---
 requirements.txt                |  1 +
 src/iranlowo/corpus/__init__.py |  4 +-
 src/iranlowo/corpus/corpus.py   | 36 +++++++++++++++++
 src/iranlowo/corpus/corpus.yml  | 13 ++++++
 src/iranlowo/corpus/loaders.py  | 71 +++++----------------------------
 tests/test_loaders.py           | 16 +++++---
 6 files changed, 73 insertions(+), 68 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5441c7f..aa90338 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+PyYAML
 bs4
 configargparse
 torch
diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py
index 4201d4e..9af9807 100644
--- a/src/iranlowo/corpus/__init__.py
+++ b/src/iranlowo/corpus/__init__.py
@@ -1,2 +1,2 @@
-from .corpus import Corpus, DirectoryCorpus
-from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus
\ No newline at end of file
+from .corpus import Corpus, DirectoryCorpus, get_corpus, get_corpus_path, download_corpus
+
diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
index 81fa2ff..01b99ca 100644
--- a/src/iranlowo/corpus/corpus.py
+++ b/src/iranlowo/corpus/corpus.py
@@ -1,6 +1,7 @@
 import gzip
 import os
 
+import yaml
 from gensim import interfaces
 from gensim.corpora.csvcorpus import CsvCorpus
 from gensim.corpora.textcorpus import walk
@@ -130,3 +131,38 @@ def read_files(self):
             if self.depth <= depth:
                 for path in filenames:
                     yield os.path.join(dirpath, path)
+
+
+def get_corpus(name, niger_volta=False, **kwargs):
+    def file_or_dir(path, mode):
+        if mode == "single":
+            return Corpus(path=path, **kwargs)
+        else:
+            return DirectoryCorpus(path=path, **kwargs)
+
+    with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream:
+        data = yaml.safe_load(stream)
+    if niger_volta:
+        nvc = data.get('niger_volta')
+        if name not in nvc.keys():
+            raise ValueError("Corpus {} does not exist".format(name))
+        else:
+            path = os.path.join(os.environ['NIGER_VOLTA_CORPUS'], nvc[name]['path'])
+            return file_or_dir(path, nvc[name]['mode'])
+    else:
+        path = os.path.join(os.path.dirname(__file__), 'corpus/{}'.format(data['path']))
+        return file_or_dir(path, data['mode'])
+
+
+def get_corpus_path(name):
+    with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream:
+        data = yaml.safe_load(stream)
+        if name not in data.keys():
+            raise ValueError("Corpus {} does not exist".format(name))
+        else:
+            return os.path.join(os.path.dirname(__file__), data[name])
+
+
+def download_corpus(name, uri=None):
+    pass
+
diff --git a/src/iranlowo/corpus/corpus.yml b/src/iranlowo/corpus/corpus.yml
index e69de29..93dc8d9 100644
--- a/src/iranlowo/corpus/corpus.yml
+++ b/src/iranlowo/corpus/corpus.yml
@@ -0,0 +1,13 @@
+niger_volta:
+      yoruba_blog:
+          path: "TheYorubaBlog/theyorubablog_dot_com.txt"
+          mode: single
+      owe_yoruba:
+          path: " "
+          mode: dir
+      quran_mimo:
+          path: " "
+          mode: dir
+      asubiaro:
+          path: " "
+          mode: single
\ No newline at end of file
diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
index 1314af6..2d3e17d 100644
--- a/src/iranlowo/corpus/loaders.py
+++ b/src/iranlowo/corpus/loaders.py
@@ -1,66 +1,15 @@
 import os
 
-from iranlowo.corpus import Corpus, DirectoryCorpus
+from iranlowo.corpus import get_corpus
 
+os.environ['NIGER_VOLTA_CORPUS'] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text"
 
-class BaseLoader(object):
-    def __init__(self, corpus_path):
-        self.corpus_path = corpus_path
-        yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None)
-        if not yoruba_text_path:
-            raise NotADirectoryError(
-                "YORUBA_TEXT_PATH environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to YORUBA_TEXT_PATH to it's "
-                "path")
-        else:
-            corpus_path = "{}/{}".format(yoruba_text_path, corpus_path)
-            self.path = corpus_path
-
-
-class YorubaBlogCorpus(Corpus):
-    def __init__(self, path):
-        """
-
-        Args:
-            path:
-        """
-        super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs)
-
-
-class BBCCorpus(Corpus):
-    def __init__(self, path):
-        """
-
-        Args:
-            path:
-        """
-        super(BBCCorpus, self).__init__(path=self.path, **kwargs)
-        super().__init__(path)
-
-
-class BibeliCorpus(Corpus):
-    def __init__(self, path):
-        """
-
-        Args:
-            path:
-        """
-        super(BibeliCorpus, self).__init__(path=self.path, **kwargs)
-
-
-class en(BaseLoader, DirectoryCorpus):
-    def __init__(self):
-        BaseLoader.__init__(self, corpus_path="Owe/en")
-        DirectoryCorpus.__init__(self, path=self.path)
-
-
-class yo(BaseLoader, DirectoryCorpus):
-    def __init__(self):
-        BaseLoader.__init__(self, corpus_path="Owe/yo")
-        DirectoryCorpus.__init__(self, path=self.path)
-
-
-class OweLoader(object):
-    def __init__(self):
-        self.en = en()
-        self.yo = yo()
 
+def niger_volta_corpus(corpus_code):
+    nvc_path = os.environ.get("NIGER_VOLTA_CORPUS", None)
+    if not nvc_path:
+        raise NotADirectoryError(
+            "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's "
+            "path")
+    else:
+        return get_corpus(name=corpus_code, niger_volta=True)
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
index 7468a35..91f0264 100644
--- a/tests/test_loaders.py
+++ b/tests/test_loaders.py
@@ -1,12 +1,18 @@
 import unittest
 
-from iranlowo import corpus
+from iranlowo.corpus import loaders, corpus
 
 
 class TestCoprusLoader(unittest.TestCase):
-    def setUp(self):
-        self.owe_loader = corpus.OweLoader
+    def test_load_yoruba_blog(self):
+        yb = loaders.niger_volta_corpus('yoruba_blog')
+        self.assertIsInstance(yb, corpus.Corpus)
 
-    def test_load_owe(self):
+    def test_load_owe_empty(self):
         with self.assertRaises(NotADirectoryError):
-            self.owe_loader()
+            loaders.niger_volta_corpus('owe_yoruba')
+
+    def test_load_corpus_does_not_exist(self):
+        with self.assertRaises(ValueError):
+            loaders.niger_volta_corpus('owe')
+

From 82608b69231277495a254ed16fd51f75ef75e463 Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sun, 15 Dec 2019 23:05:05 +0100
Subject: [PATCH 6/9] Introduced black for formatting

---
 src/iranlowo/corpus/__init__.py |  9 ++++--
 src/iranlowo/corpus/corpus.py   | 57 +++++++++++++++++++++++----------
 src/iranlowo/corpus/loaders.py  |  7 ++--
 3 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py
index 9af9807..16a9c96 100644
--- a/src/iranlowo/corpus/__init__.py
+++ b/src/iranlowo/corpus/__init__.py
@@ -1,2 +1,7 @@
-from .corpus import Corpus, DirectoryCorpus, get_corpus, get_corpus_path, download_corpus
-
+from .corpus import (
+    Corpus,
+    DirectoryCorpus,
+    get_corpus,
+    get_corpus_path,
+    download_corpus,
+)
diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
index 01b99ca..e85679c 100644
--- a/src/iranlowo/corpus/corpus.py
+++ b/src/iranlowo/corpus/corpus.py
@@ -11,7 +11,16 @@
 
 
 class Corpus(interfaces.CorpusABC):
-    def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None):
+    def __init__(
+        self,
+        path=None,
+        text=None,
+        stream=False,
+        fformat="txt",
+        cformat=None,
+        labels=False,
+        preprocess=None,
+    ):
         """
 
         Args:
@@ -19,16 +28,23 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No
             text:
         """
         self.path = path
+        print(self.path)
         self.text = text
         self.labels = labels
         self.stream = stream
         self.fformat = fformat
         self.cformat = cformat
         self.preprocess = preprocess
-        assert self.path or self.text, "You should pass either a path or text to read data from."
+        assert (
+            self.path or self.text
+        ), "You should pass either a path or text to read data from."
         if not self.preprocess:
             self.preprocess = [normalize_diacritics_text]
-        self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text()
+        self.data = (
+            self.read_file_filename_or_text(text=text)
+            if text
+            else self.read_file_filename_or_text()
+        )
         self.validate_format()
 
     def __iter__(self):
@@ -70,12 +86,14 @@ def read_file_filename_or_text(self, f=None, text=None):
                     text = open(path)
                 elif self.fformat == "csv":
                     text = CsvCorpus(path, self.labels)
-                elif self.fformat == 'gzip':
+                elif self.fformat == "gzip":
                     text = gzip.open(path)
             else:
                 text = self.path.seek(0)
 
-            text = text.read() if not self.stream else ''.join(list(self.streamfile(text)))
+            text = (
+                text.read() if not self.stream else "".join(list(self.streamfile(text)))
+            )
             return self.handle_preprocessing(text) if self.preprocess else text
 
     def handle_preprocessing(self, text):
@@ -94,12 +112,16 @@ def validate_format(self):
         """
         data = self.data
         if isinstance(data, list):
-            data = ''.join(data)
+            data = "".join(data)
         if not self.cformat and not is_text_nfc(data):
             raise TypeError("The corpus does not comply to the NFC corpus format")
         elif self.cformat == "owe":
             if not is_valid_owé_format(data):
-                raise TypeError("The corpus does not comply to the {0} corpus format".format(self.cformat))
+                raise TypeError(
+                    "The corpus does not comply to the {0} corpus format".format(
+                        self.cformat
+                    )
+                )
             else:
                 return True
 
@@ -119,14 +141,16 @@ def generate(self, size):
 class DirectoryCorpus(Corpus):
     def __init__(self, path, **kwargs):
         self.dir_path = path
-        self.depth = kwargs.get('min_depth', 0)
+        self.depth = kwargs.get("min_depth", 0)
         self.path = list(self.read_files())
         super(DirectoryCorpus, self).__init__(path=self.path, **kwargs)
 
     def read_files(self):
         walked = list(walk(self.dir_path))
         if not walked:
-            raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path))
+            raise NotADirectoryError(
+                "'{}' is not a valid directory".format(self.dir_path)
+            )
         for depth, dirpath, _, filenames in walked:
             if self.depth <= depth:
                 for path in filenames:
@@ -140,22 +164,22 @@ def file_or_dir(path, mode):
         else:
             return DirectoryCorpus(path=path, **kwargs)
 
-    with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream:
+    with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream:
         data = yaml.safe_load(stream)
     if niger_volta:
-        nvc = data.get('niger_volta')
+        nvc = data.get("niger_volta")
         if name not in nvc.keys():
             raise ValueError("Corpus {} does not exist".format(name))
         else:
-            path = os.path.join(os.environ['NIGER_VOLTA_CORPUS'], nvc[name]['path'])
-            return file_or_dir(path, nvc[name]['mode'])
+            path = os.path.join(os.environ["NIGER_VOLTA_CORPUS"], nvc[name]["path"])
+            return file_or_dir(path, nvc[name]["mode"])
     else:
-        path = os.path.join(os.path.dirname(__file__), 'corpus/{}'.format(data['path']))
-        return file_or_dir(path, data['mode'])
+        path = os.path.join(os.path.dirname(__file__), "corpus/{}".format(data["path"]))
+        return file_or_dir(path, data["mode"])
 
 
 def get_corpus_path(name):
-    with open(os.path.join(os.path.dirname(__file__), 'corpus.yml'), 'r') as stream:
+    with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream:
         data = yaml.safe_load(stream)
         if name not in data.keys():
             raise ValueError("Corpus {} does not exist".format(name))
@@ -165,4 +189,3 @@ def get_corpus_path(name):
 
 def download_corpus(name, uri=None):
     pass
-
diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
index 2d3e17d..1565ec1 100644
--- a/src/iranlowo/corpus/loaders.py
+++ b/src/iranlowo/corpus/loaders.py
@@ -2,7 +2,9 @@
 
 from iranlowo.corpus import get_corpus
 
-os.environ['NIGER_VOLTA_CORPUS'] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text"
+os.environ[
+    "NIGER_VOLTA_CORPUS"
+] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text"
 
 
 def niger_volta_corpus(corpus_code):
@@ -10,6 +12,7 @@ def niger_volta_corpus(corpus_code):
     if not nvc_path:
         raise NotADirectoryError(
             "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's "
-            "path")
+            "path"
+        )
     else:
         return get_corpus(name=corpus_code, niger_volta=True)

From e3cace6ba4965917ace0b46e51110ccf40f3d7f5 Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sun, 15 Dec 2019 23:14:16 +0100
Subject: [PATCH 7/9] Removed print statements

---
 src/iranlowo/corpus/corpus.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
index e85679c..0bdf3bf 100644
--- a/src/iranlowo/corpus/corpus.py
+++ b/src/iranlowo/corpus/corpus.py
@@ -28,7 +28,6 @@ def __init__(
             text:
         """
         self.path = path
-        print(self.path)
         self.text = text
         self.labels = labels
         self.stream = stream

From 5ee7a63d3324002f542e7921eb1fe7defcbe0748 Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sun, 15 Dec 2019 23:32:04 +0100
Subject: [PATCH 8/9] Added NIGER_VOLTA_CORPUS folder check.

---
 src/iranlowo/corpus/corpus.py  | 5 +++++
 src/iranlowo/corpus/loaders.py | 9 +--------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
index 0bdf3bf..c4610e2 100644
--- a/src/iranlowo/corpus/corpus.py
+++ b/src/iranlowo/corpus/corpus.py
@@ -170,6 +170,11 @@ def file_or_dir(path, mode):
         if name not in nvc.keys():
             raise ValueError("Corpus {} does not exist".format(name))
         else:
+            if not os.environ.get("NIGER_VOLTA_CORPUS", None):
+                raise NotADirectoryError(
+                    "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's "
+                    "path"
+                )
             path = os.path.join(os.environ["NIGER_VOLTA_CORPUS"], nvc[name]["path"])
             return file_or_dir(path, nvc[name]["mode"])
     else:
diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
index 1565ec1..07e60a8 100644
--- a/src/iranlowo/corpus/loaders.py
+++ b/src/iranlowo/corpus/loaders.py
@@ -8,11 +8,4 @@
 
 
 def niger_volta_corpus(corpus_code):
-    nvc_path = os.environ.get("NIGER_VOLTA_CORPUS", None)
-    if not nvc_path:
-        raise NotADirectoryError(
-            "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's "
-            "path"
-        )
-    else:
-        return get_corpus(name=corpus_code, niger_volta=True)
+    return get_corpus(name=corpus_code, niger_volta=True)

From 7db0ec49d6d7728ae1fad5ddc5102c934ed336b9 Mon Sep 17 00:00:00 2001
From: Olamyy <olamyy53@gmail.com>
Date: Sun, 15 Dec 2019 23:50:38 +0100
Subject: [PATCH 9/9] Fixed failing tests

---
 src/iranlowo/corpus/loaders.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
index 07e60a8..e82a9ed 100644
--- a/src/iranlowo/corpus/loaders.py
+++ b/src/iranlowo/corpus/loaders.py
@@ -2,10 +2,6 @@
 
 from iranlowo.corpus import get_corpus
 
-os.environ[
-    "NIGER_VOLTA_CORPUS"
-] = "/Users/Olamilekan/Desktop/Machine Learning/OpenSource/yoruba-text"
-
 
 def niger_volta_corpus(corpus_code):
     return get_corpus(name=corpus_code, niger_volta=True)