diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation: every target is forwarded to
+# sphinx-build's "make mode" (-M).
+
+# SPHINXOPTS and SPHINXBUILD use ?=, so they can be set from the environment
+# or the command line; SOURCEDIR and BUILDDIR from the command line only.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# "help" is the first rule, so "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route every other target name to sphinx-build -M, passing
+# the name ($@) as the builder.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..4f4c978
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,54 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'Iranlowo'
+copyright = '2019, Ruoho Ruosi , Olamilekan Wahab'
+author = 'Ruoho Ruosi , Olamilekan Wahab'
+
+# The full version, including alpha/beta/rc tags
+release = '0.1'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['recommonmark']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..26780cb
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,10 @@
+.. Iranlowo documentation master file, created by
+   sphinx-quickstart on Sat Jul  6 09:15:49 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+.. include:: ../README.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..922152e
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/requirements.txt b/requirements.txt
index 5441c7f..aa90338 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+PyYAML
 bs4
 configargparse
 torch
diff --git a/src/iranlowo/corpus/__init__.py b/src/iranlowo/corpus/__init__.py
index 4201d4e..16a9c96 100644
--- a/src/iranlowo/corpus/__init__.py
+++ b/src/iranlowo/corpus/__init__.py
@@ -1,2 +1,7 @@
-from .corpus import Corpus, DirectoryCorpus
-from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus
\ No newline at end of file
+from .corpus import (
+    Corpus,
+    DirectoryCorpus,
+    get_corpus,
+    get_corpus_path,
+    download_corpus,
+)
diff --git a/src/iranlowo/corpus/corpus.py b/src/iranlowo/corpus/corpus.py
index 81fa2ff..c4610e2 100644
--- a/src/iranlowo/corpus/corpus.py
+++ b/src/iranlowo/corpus/corpus.py
@@ -1,6 +1,7 @@
 import gzip
 import os
 
+import yaml
 from gensim import interfaces
 from gensim.corpora.csvcorpus import CsvCorpus
 from gensim.corpora.textcorpus import walk
@@ -10,7 +11,16 @@
 
 
 class Corpus(interfaces.CorpusABC):
-    def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None):
+    def __init__(
+        self,
+        path=None,
+        text=None,
+        stream=False,
+        fformat="txt",
+        cformat=None,
+        labels=False,
+        preprocess=None,
+    ):
         """
 
         Args:
@@ -24,10 +34,16 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No
         self.fformat = fformat
         self.cformat = cformat
         self.preprocess = preprocess
-        assert self.path or self.text, "You should pass either a path or text to read data from."
+        assert (
+            self.path or self.text
+        ), "You should pass either a path or text to read data from."
         if not self.preprocess:
             self.preprocess = [normalize_diacritics_text]
-        self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text()
+        self.data = (
+            self.read_file_filename_or_text(text=text)
+            if text
+            else self.read_file_filename_or_text()
+        )
         self.validate_format()
 
     def __iter__(self):
@@ -69,12 +85,14 @@ def read_file_filename_or_text(self, f=None, text=None):
                     text = open(path)
                 elif self.fformat == "csv":
                     text = CsvCorpus(path, self.labels)
-                elif self.fformat == 'gzip':
+                elif self.fformat == "gzip":
                     text = gzip.open(path)
             else:
                 text = self.path.seek(0)
 
-            text = text.read() if not self.stream else ''.join(list(self.streamfile(text)))
+            text = (
+                text.read() if not self.stream else "".join(list(self.streamfile(text)))
+            )
             return self.handle_preprocessing(text) if self.preprocess else text
 
     def handle_preprocessing(self, text):
@@ -93,12 +111,16 @@ def validate_format(self):
         """
         data = self.data
         if isinstance(data, list):
-            data = ''.join(data)
+            data = "".join(data)
         if not self.cformat and not is_text_nfc(data):
             raise TypeError("The corpus does not comply to the NFC corpus format")
         elif self.cformat == "owe":
             if not is_valid_owé_format(data):
-                raise TypeError("The corpus does not comply to the {0} corpus format".format(self.cformat))
+                raise TypeError(
+                    "The corpus does not comply to the {0} corpus format".format(
+                        self.cformat
+                    )
+                )
             else:
                 return True
 
@@ -118,15 +140,56 @@ def generate(self, size):
 class DirectoryCorpus(Corpus):
     def __init__(self, path, **kwargs):
         self.dir_path = path
-        self.depth = kwargs.get('min_depth', 0)
+        self.depth = kwargs.pop("min_depth", 0)  # not forwarded: Corpus.__init__ rejects it
         self.path = list(self.read_files())
         super(DirectoryCorpus, self).__init__(path=self.path, **kwargs)
 
     def read_files(self):
         walked = list(walk(self.dir_path))
         if not walked:
-            raise NotADirectoryError("'{}' is not a valid directory".format(self.dir_path))
+            raise NotADirectoryError(
+                "'{}' is not a valid directory".format(self.dir_path)
+            )
         for depth, dirpath, _, filenames in walked:
             if self.depth <= depth:
                 for path in filenames:
                     yield os.path.join(dirpath, path)
+
+
+def get_corpus(name, niger_volta=False, **kwargs):
+    """Build the corpus registered under ``name`` in the bundled ``corpus.yml``.
+
+    ``niger_volta=True`` reads the entry from the ``niger_volta`` section and joins its
+    path to ``$NIGER_VOLTA_CORPUS``; otherwise ``name`` is a top-level entry under this
+    package's ``corpus/`` directory. Mode ``single`` yields ``Corpus``, anything else
+    ``DirectoryCorpus``. Raises ValueError (unknown name), NotADirectoryError (no env).
+    """
+    here = os.path.dirname(__file__)
+    with open(os.path.join(here, "corpus.yml"), "r") as stream:
+        data = yaml.safe_load(stream) or {}
+    entry = ((data.get("niger_volta") or {}) if niger_volta else data).get(name)
+    if not isinstance(entry, dict) or "path" not in entry:
+        raise ValueError("Corpus {} does not exist".format(name))
+    root = os.path.join(here, "corpus")  # local entries resolve under <package>/corpus/
+    if niger_volta:
+        root = os.environ.get("NIGER_VOLTA_CORPUS", None)
+        if not root:
+            raise NotADirectoryError(
+                "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's "
+                "path"
+            )
+    cls = Corpus if entry["mode"] == "single" else DirectoryCorpus
+    return cls(path=os.path.join(root, entry["path"]), **kwargs)
+
+
+def get_corpus_path(name):
+    """Join this package's directory with the value ``corpus.yml`` stores under ``name``."""
+    with open(os.path.join(os.path.dirname(__file__), "corpus.yml"), "r") as stream:
+        registry = yaml.safe_load(stream)
+    if name not in registry.keys():  # only top-level keys are consulted
+        raise ValueError("Corpus {} does not exist".format(name))
+    return os.path.join(os.path.dirname(__file__), registry[name])  # NOTE(review): fails if the entry is a mapping - confirm schema
+
+
+def download_corpus(name, uri=None):
+    """Placeholder: performs no download yet and returns ``None``."""
diff --git a/src/iranlowo/corpus/corpus.yml b/src/iranlowo/corpus/corpus.yml
new file mode 100644
index 0000000..93dc8d9
--- /dev/null
+++ b/src/iranlowo/corpus/corpus.yml
@@ -0,0 +1,13 @@
+niger_volta:
+      yoruba_blog:
+          path: "TheYorubaBlog/theyorubablog_dot_com.txt"
+          mode: single
+      owe_yoruba:
+          path: " "
+          mode: dir
+      quran_mimo:
+          path: " "
+          mode: dir
+      asubiaro:
+          path: " "
+          mode: single
\ No newline at end of file
diff --git a/src/iranlowo/corpus/loaders.py b/src/iranlowo/corpus/loaders.py
index 1314af6..e82a9ed 100644
--- a/src/iranlowo/corpus/loaders.py
+++ b/src/iranlowo/corpus/loaders.py
@@ -1,66 +1,7 @@
 import os
 
-from iranlowo.corpus import Corpus, DirectoryCorpus
+from iranlowo.corpus import get_corpus
 
 
-class BaseLoader(object):
-    def __init__(self, corpus_path):
-        self.corpus_path = corpus_path
-        yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None)
-        if not yoruba_text_path:
-            raise NotADirectoryError(
-                "YORUBA_TEXT_PATH environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to YORUBA_TEXT_PATH to it's "
-                "path")
-        else:
-            corpus_path = "{}/{}".format(yoruba_text_path, corpus_path)
-            self.path = corpus_path
-
-
-class YorubaBlogCorpus(Corpus):
-    def __init__(self, path):
-        """
-
-        Args:
-            path:
-        """
-        super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs)
-
-
-class BBCCorpus(Corpus):
-    def __init__(self, path):
-        """
-
-        Args:
-            path:
-        """
-        super(BBCCorpus, self).__init__(path=self.path, **kwargs)
-        super().__init__(path)
-
-
-class BibeliCorpus(Corpus):
-    def __init__(self, path):
-        """
-
-        Args:
-            path:
-        """
-        super(BibeliCorpus, self).__init__(path=self.path, **kwargs)
-
-
-class en(BaseLoader, DirectoryCorpus):
-    def __init__(self):
-        BaseLoader.__init__(self, corpus_path="Owe/en")
-        DirectoryCorpus.__init__(self, path=self.path)
-
-
-class yo(BaseLoader, DirectoryCorpus):
-    def __init__(self):
-        BaseLoader.__init__(self, corpus_path="Owe/yo")
-        DirectoryCorpus.__init__(self, path=self.path)
-
-
-class OweLoader(object):
-    def __init__(self):
-        self.en = en()
-        self.yo = yo()
-
+def niger_volta_corpus(corpus_code):  # look up corpus_code via get_corpus with niger_volta=True
+    return get_corpus(name=corpus_code, niger_volta=True)
diff --git a/src/iranlowo/corpus/scrapper.py b/src/iranlowo/corpus/scrapper.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/iranlowo/tokenizer.py b/src/iranlowo/tokenizer.py
new file mode 100644
index 0000000..d09560d
--- /dev/null
+++ b/src/iranlowo/tokenizer.py
@@ -0,0 +1,84 @@
+import gensim
+
+
+class Tokenizer(object):
+    """Word- and sentence-level tokenizer over a single block of text.
+
+    ``ngram_tokenize`` and ``morph_tokenize`` are placeholders that return ``None``.
+    """
+
+    def __init__(self, text, model=None, symbol=' ', func=None):
+        """Store the text and tokenization settings; no work is done here.
+
+        Args:
+            text: string to tokenize.
+            model: stored on the instance; not used by any method in this class.
+            symbol: delimiter ``sentence_tokenize`` splits lines at (default: space).
+            func: stored on the instance; not used by any method in this class.
+        """
+        self.text = text
+        self.symbol = symbol
+        self.func = func
+        self.model = model
+
+    def ngram_tokenize(self):
+        pass
+
+    def word_tokenize(self, symbol=None, map_entities=False):
+        """Return the tokens of ``self.text`` as a list.
+
+        Args:
+            symbol: when falsy, tokens come from ``gensim.utils.simple_tokenize``;
+                otherwise the text is returned as a list of its characters.
+            map_entities: accepted but ignored; entity mapping is not implemented.
+        """
+        if not symbol:
+            return list(gensim.utils.simple_tokenize(self.text))
+        # NOTE(review): ``symbol`` only selects this branch; it is never split on.
+        return list(self.text)
+
+    def sentence_tokenize(self, min_words_to_split=10, min_words_in_utt=5):
+        """Split each line of ``self.text`` at ``self.symbol`` into shorter utterances.
+
+        A line is cut only if it has at least ``min_words_to_split`` words and both
+        sides keep ``min_words_in_utt`` words (right side counted from the symbol on).
+
+        Args:
+            min_words_to_split: minimum word count the remaining line needs to be cut.
+            min_words_in_utt: minimum word count required on each side of a cut.
+
+        Returns:
+            list of str: pieces in order. A piece cut off at a symbol gets a trailing
+            newline; line remainders and lines without the symbol are kept as they are.
+        """
+        output = []
+        for line in self.text.splitlines():
+            if self.symbol not in line:
+                output.append(line)
+                continue
+            curr_line = line
+            num_words = len(curr_line.split())
+            num_symbols = curr_line.count(self.symbol)
+            pos = curr_line.index(self.symbol)
+            while num_symbols > 0:
+                if num_words < min_words_to_split:
+                    # Remaining text is too short to be worth splitting.
+                    output.append(curr_line)
+                    break
+                head, tail = curr_line[:pos], curr_line[pos:]
+                num_symbols -= 1
+                if min(len(head.split()), len(tail.split())) >= min_words_in_utt:
+                    output.append(head + "\n")
+                    curr_line = curr_line[pos + 1:]
+                    num_words = len(curr_line.split())
+                    if num_symbols > 0:
+                        pos = curr_line.index(self.symbol)
+                elif num_symbols > 0:
+                    # A side is too short: move on to the next symbol in the same text.
+                    pos += curr_line[pos + 1:].index(self.symbol) + 1
+                if num_symbols == 0:
+                    output.append(curr_line)
+        return output
+
+    def morph_tokenize(self):
+        pass
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
index 7468a35..91f0264 100644
--- a/tests/test_loaders.py
+++ b/tests/test_loaders.py
@@ -1,12 +1,18 @@
 import unittest
 
-from iranlowo import corpus
+from iranlowo.corpus import loaders, corpus
 
 
 class TestCoprusLoader(unittest.TestCase):
-    def setUp(self):
-        self.owe_loader = corpus.OweLoader
+    def test_load_yoruba_blog(self):
+        yb = loaders.niger_volta_corpus('yoruba_blog')
+        self.assertIsInstance(yb, corpus.Corpus)
 
-    def test_load_owe(self):
+    def test_load_owe_empty(self):
         with self.assertRaises(NotADirectoryError):
-            self.owe_loader()
+            loaders.niger_volta_corpus('owe_yoruba')
+
+    def test_load_corpus_does_not_exist(self):
+        with self.assertRaises(ValueError):
+            loaders.niger_volta_corpus('owe')
+