Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Corpus Module Cleanup #19

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
54 changes: 54 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'Iranlowo'
copyright = '2019, Ruoho Ruosi , Olamilekan Wahab'
author = 'Ruoho Ruosi , Olamilekan Wahab'

# The full version, including alpha/beta/rc tags
release = '0.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['recommonmark']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
10 changes: 10 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.. Iranlowo documentation master file, created by
sphinx-quickstart on Sat Jul 6 09:15:49 2019.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.

.. include:: ../README.rst

.. toctree::
:maxdepth: 1
:caption: Contents:
35 changes: 35 additions & 0 deletions docs/make.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
PyYAML
bs4
configargparse
torch
Expand Down
9 changes: 7 additions & 2 deletions src/iranlowo/corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
from .corpus import Corpus, DirectoryCorpus
from .loaders import OweLoader, YorubaBlogCorpus, BBCCorpus, BibeliCorpus
from .corpus import (
Corpus,
DirectoryCorpus,
get_corpus,
get_corpus_path,
download_corpus,
)
81 changes: 72 additions & 9 deletions src/iranlowo/corpus/corpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import gzip
import os

import yaml
from gensim import interfaces
from gensim.corpora.csvcorpus import CsvCorpus
from gensim.corpora.textcorpus import walk
Expand All @@ -10,7 +11,16 @@


class Corpus(interfaces.CorpusABC):
def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None):
def __init__(
self,
path=None,
text=None,
stream=False,
fformat="txt",
cformat=None,
labels=False,
preprocess=None,
):
"""

Args:
Expand All @@ -24,10 +34,16 @@ def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=No
self.fformat = fformat
self.cformat = cformat
self.preprocess = preprocess
assert self.path or self.text, "You should pass either a path or text to read data from."
assert (
self.path or self.text
), "You should pass either a path or text to read data from."
if not self.preprocess:
self.preprocess = [normalize_diacritics_text]
self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text()
self.data = (
self.read_file_filename_or_text(text=text)
if text
else self.read_file_filename_or_text()
)
self.validate_format()

def __iter__(self):
Expand Down Expand Up @@ -69,12 +85,14 @@ def read_file_filename_or_text(self, f=None, text=None):
text = open(path)
elif self.fformat == "csv":
text = CsvCorpus(path, self.labels)
elif self.fformat == 'gzip':
elif self.fformat == "gzip":
text = gzip.open(path)
else:
text = self.path.seek(0)

text = text.read() if not self.stream else ''.join(list(self.streamfile(text)))
text = (
text.read() if not self.stream else "".join(list(self.streamfile(text)))
)
return self.handle_preprocessing(text) if self.preprocess else text

def handle_preprocessing(self, text):
Expand All @@ -93,12 +111,16 @@ def validate_format(self):
"""
data = self.data
if isinstance(data, list):
data = ''.join(data)
data = "".join(data)
if not self.cformat and not is_text_nfc(data):
raise TypeError("The corpus does not comply to the NFC corpus format")
elif self.cformat == "owe":
if not is_valid_owé_format(data):
raise TypeError("The corpus does not comply to the {0} corpus format".format(self.cformat))
raise TypeError(
"The corpus does not comply to the {0} corpus format".format(
self.cformat
)
)
else:
return True

Expand All @@ -118,15 +140,56 @@ def generate(self, size):
class DirectoryCorpus(Corpus):
    """Corpus built from the files found while walking a directory tree."""

    def __init__(self, path, **kwargs):
        """
        Args:
            path: Root directory that is walked to collect corpus files.
            **kwargs: Extra keyword arguments forwarded to ``Corpus.__init__``.
                ``min_depth`` (default 0) is consumed here: only files found
                at that walk depth or deeper are included.

        Raises:
            NotADirectoryError: If walking ``path`` yields nothing.
        """
        self.dir_path = path
        # pop (not get): Corpus.__init__ does not accept ``min_depth``, so
        # leaving it in kwargs would make the super() call raise TypeError.
        self.depth = kwargs.pop("min_depth", 0)
        # read_files() is a generator; materialise it so that an invalid
        # directory fails here rather than on first use.
        self.path = list(self.read_files())
        super(DirectoryCorpus, self).__init__(path=self.path, **kwargs)

    def read_files(self):
        """Yield the path of every file at depth >= ``self.depth``.

        Raises:
            NotADirectoryError: If walking ``self.dir_path`` yields nothing.
        """
        walked = list(walk(self.dir_path))
        if not walked:
            raise NotADirectoryError(
                "'{}' is not a valid directory".format(self.dir_path)
            )
        for depth, dirpath, _, filenames in walked:
            if self.depth <= depth:
                for path in filenames:
                    yield os.path.join(dirpath, path)


def get_corpus(name, niger_volta=False, **kwargs):
    """Build the corpus registered under ``name`` in ``corpus.yml``.

    Args:
        name: Key of the corpus in the registry file that sits next to this
            module.
        niger_volta: When true, look ``name`` up in the ``niger_volta``
            section and resolve its path against the directory named by the
            ``NIGER_VOLTA_CORPUS`` environment variable. Otherwise look
            ``name`` up at the top level and resolve its path against this
            package's ``corpus`` directory.
        **kwargs: Forwarded to the ``Corpus`` / ``DirectoryCorpus``
            constructor.

    Returns:
        A ``Corpus`` when the entry's ``mode`` is ``"single"``, otherwise a
        ``DirectoryCorpus``.

    Raises:
        ValueError: If ``name`` is not registered.
        NotADirectoryError: If ``niger_volta`` is true and the
            ``NIGER_VOLTA_CORPUS`` environment variable is unset or empty.
    """

    def file_or_dir(path, mode):
        # "single" means one file; any other mode is treated as a directory.
        if mode == "single":
            return Corpus(path=path, **kwargs)
        return DirectoryCorpus(path=path, **kwargs)

    package_dir = os.path.dirname(__file__)
    with open(os.path.join(package_dir, "corpus.yml"), "r") as stream:
        # An empty YAML document loads as None; treat it as an empty registry.
        data = yaml.safe_load(stream) or {}

    if niger_volta:
        # A missing or empty section should report an unknown corpus rather
        # than fail with AttributeError on None.
        nvc = data.get("niger_volta") or {}
        if name not in nvc:
            raise ValueError("Corpus {} does not exist".format(name))
        root = os.environ.get("NIGER_VOLTA_CORPUS")
        if not root:
            raise NotADirectoryError(
                "NIGER_VOLTA_CORPUS environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to NIGER_VOLTA_CORPUS to it's "
                "path"
            )
        entry = nvc[name]
        return file_or_dir(os.path.join(root, entry["path"]), entry["mode"])

    # Look the entry up by ``name``; the previous code indexed the top-level
    # mapping directly ("path"/"mode") and never used ``name`` in this branch.
    # NOTE(review): assumes bundled entries use the same path/mode schema as
    # the niger_volta entries - confirm once such entries are added.
    entry = data.get(name)
    if not isinstance(entry, dict) or "path" not in entry or "mode" not in entry:
        raise ValueError("Corpus {} does not exist".format(name))
    path = os.path.join(package_dir, "corpus/{}".format(entry["path"]))
    return file_or_dir(path, entry["mode"])


def get_corpus_path(name):
    """Return the registered location of corpus ``name``.

    The registry is the ``corpus.yml`` file next to this module; the value
    stored under ``name`` is joined onto this module's directory.

    Raises:
        ValueError: If ``name`` is not a top-level key of the registry.
    """
    registry_file = os.path.join(os.path.dirname(__file__), "corpus.yml")
    with open(registry_file, "r") as stream:
        registry = yaml.safe_load(stream)
        known_names = registry.keys()
        if name in known_names:
            return os.path.join(os.path.dirname(__file__), registry[name])
        raise ValueError("Corpus {} does not exist".format(name))


def download_corpus(name, uri=None):
    """Placeholder for corpus downloading; not implemented yet.

    Both arguments are currently ignored and the function always returns
    ``None``.
    """
    return None
13 changes: 13 additions & 0 deletions src/iranlowo/corpus/corpus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
niger_volta:
yoruba_blog:
path: "TheYorubaBlog/theyorubablog_dot_com.txt"
mode: single
owe_yoruba:
path: " "
mode: dir
quran_mimo:
path: " "
mode: dir
asubiaro:
path: " "
mode: single
65 changes: 3 additions & 62 deletions src/iranlowo/corpus/loaders.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,7 @@
import os

from iranlowo.corpus import Corpus, DirectoryCorpus
from iranlowo.corpus import get_corpus


class BaseLoader(object):
def __init__(self, corpus_path):
self.corpus_path = corpus_path
yoruba_text_path = os.environ.get("YORUBA_TEXT_PATH", None)
if not yoruba_text_path:
raise NotADirectoryError(
"YORUBA_TEXT_PATH environment variable not found. Please, clone the corpus repository from https://github.com/Niger-Volta-LTI/yoruba-text and set to YORUBA_TEXT_PATH to it's "
"path")
else:
corpus_path = "{}/{}".format(yoruba_text_path, corpus_path)
self.path = corpus_path


class YorubaBlogCorpus(Corpus):
def __init__(self, path):
"""

Args:
path:
"""
super(YorubaBlogCorpus, self).__init__(path=self.path, **kwargs)


class BBCCorpus(Corpus):
def __init__(self, path):
"""

Args:
path:
"""
super(BBCCorpus, self).__init__(path=self.path, **kwargs)
super().__init__(path)


class BibeliCorpus(Corpus):
def __init__(self, path):
"""

Args:
path:
"""
super(BibeliCorpus, self).__init__(path=self.path, **kwargs)


class en(BaseLoader, DirectoryCorpus):
def __init__(self):
BaseLoader.__init__(self, corpus_path="Owe/en")
DirectoryCorpus.__init__(self, path=self.path)


class yo(BaseLoader, DirectoryCorpus):
def __init__(self):
BaseLoader.__init__(self, corpus_path="Owe/yo")
DirectoryCorpus.__init__(self, path=self.path)


class OweLoader(object):
def __init__(self):
self.en = en()
self.yo = yo()

def niger_volta_corpus(corpus_code):
    """Return the Niger-Volta corpus registered under ``corpus_code``.

    Thin convenience wrapper: delegates to ``get_corpus`` with
    ``niger_volta=True``.
    """
    lookup = {"name": corpus_code, "niger_volta": True}
    return get_corpus(**lookup)
Empty file removed src/iranlowo/corpus/scrapper.py
Empty file.
Loading