diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..439f62c --- /dev/null +++ b/Makefile @@ -0,0 +1,62 @@ +define BROWSER_PYSCRIPT +import os, webbrowser, sys + +from urllib.request import pathname2url + +webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) +endef +export BROWSER_PYSCRIPT + +BROWSER := python -c "$$BROWSER_PYSCRIPT" +sources = opendatasets + +.PHONY: test format lint unittest cov pre-commit clean clean-build clean-pyc clean-cache dist release +clean-build: ## remove build artifacts + rm -fr build/ + rm -fr dist/ + rm -fr .eggs/ + find . -name '*.egg-info' -exec rm -fr {} + + find . -name '*.egg' -exec rm -f {} + + +clean-pyc: ## remove Python file artifacts + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +test: lint unittest + +format: + isort $(sources) tests + black $(sources) tests + +lint: + flake8 $(sources) tests + # mypy $(sources) tests + +unittest: clean + pytest + +cov: + pytest --cov=$(sources) --cov-branch --cov-report=term-missing tests + coverage report -m + coverage html + $(BROWSER) htmlcov/index.html + +pre-commit: + pre-commit run --all-files + +clean-cache: + rm -rf .mypy_cache .pytest_cache + rm -rf *.egg-info + rm -rf .tox dist site + rm -rf coverage.xml .coverage + rm -rf output/*.* + +clean: clean-build clean-pyc clean-cache + +dist: clean + poetry build + +release: dist ## package and upload a release + twine upload dist/* diff --git a/README.md b/README.md index 7306283..e8b091e 100644 --- a/README.md +++ b/README.md @@ -187,8 +187,12 @@ git clone https://github.com/JovianML/opendatasets.git conda create -n opendatasets python=3.5 conda activate opendatasets pip install -r requirements.txt +pip install -r requirements_dev.txt ``` 3. Open up the project in VS code and make your changes. Make sure to install the Python Extension for VS Code and select the `opendatasets` conda environment. 
+### Testing + + This package is developed and maintained by the [Jovian](https://www.jovian.ai) team. diff --git a/opendatasets/__init__.py b/opendatasets/__init__.py index 16ae539..54009e6 100644 --- a/opendatasets/__init__.py +++ b/opendatasets/__init__.py @@ -1,9 +1,11 @@ import importlib + +from opendatasets.utils.kaggle_direct import is_kaggle_url from opendatasets.utils.network import download_url, is_url from opendatasets.utils.googledrive import is_google_drive_url, download_google_drive import os from opendatasets._version import __version__ -from opendatasets.utils.kaggle_api import download_kaggle_dataset, is_kaggle_url +from opendatasets.utils.kaggle_api import download_kaggle_dataset from opendatasets.utils.archive import extract_archive diff --git a/opendatasets/datasets/owid-covid-19-latest/__init__.py b/opendatasets/datasets/owid-covid-19-latest/__init__.py index e410107..99910f3 100644 --- a/opendatasets/datasets/owid-covid-19-latest/__init__.py +++ b/opendatasets/datasets/owid-covid-19-latest/__init__.py @@ -1,8 +1,10 @@ from opendatasets.utils import download_url +last_updated = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data' \ + '/owid-covid-data-last-updated-timestamp.txt' URLs = [ 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv', - 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data-last-updated-timestamp.txt', + last_updated, 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-codebook.csv' ] diff --git a/opendatasets/utils/googledrive.py b/opendatasets/utils/googledrive.py index af76fab..bf49c47 100644 --- a/opendatasets/utils/googledrive.py +++ b/opendatasets/utils/googledrive.py @@ -1,12 +1,13 @@ -from opendatasets.utils.network import get_filename_cd import os import re import zipfile -import cgi -from opendatasets.utils.md5 import check_integrity from urllib.parse import urlparse + from tqdm import 
tqdm +from opendatasets.utils.md5 import check_integrity +from opendatasets.utils.network import get_filename_cd + def download_google_drive(url, data_dir): print('Downloading from Google Drive (may take a while):', url) @@ -28,7 +29,8 @@ def download_google_drive(url, data_dir): def is_google_drive_url(url): - return url.startswith('https://drive.google.com') or url.startswith('http://drive.google.com') or url.startswith('drive.google.com') + return url.startswith('https://drive.google.com') or url.startswith('http://drive.google.com') \ + or url.startswith('drive.google.com') def _get_google_drive_file_id(url): @@ -43,6 +45,7 @@ def _get_google_drive_file_id(url): return match.group("id") + def _quota_exceeded(response): return "Google Drive - Quota exceeded" in response.text @@ -88,9 +91,9 @@ def download_file_from_google_drive(file_id, root, filename=None, md5=None): if _quota_exceeded(response): msg = ( - "The daily quota of the file " + filename + " is exceeded and it " + - "can't be downloaded. This is a limitation of Google Drive " + - "and can only be overcome by trying again later." + "The daily quota of the file " + filename + " is exceeded and it " + + "can't be downloaded. This is a limitation of Google Drive " + + "and can only be overcome by trying again later." 
) raise RuntimeError(msg) diff --git a/opendatasets/utils/kaggle_api.py b/opendatasets/utils/kaggle_api.py index 82361c8..d5e0e02 100644 --- a/opendatasets/utils/kaggle_api.py +++ b/opendatasets/utils/kaggle_api.py @@ -1,8 +1,11 @@ +import json import os -from opendatasets.utils.kaggle_direct import get_kaggle_dataset_id, is_kaggle_url -from opendatasets.utils.archive import extract_archive +from pathlib import Path + import click -import json + +from opendatasets.utils.archive import extract_archive +from opendatasets.utils.kaggle_direct import get_kaggle_dataset_id def _get_kaggle_key(): @@ -17,16 +20,20 @@ def _get_kaggle_key(): return user_input -def read_kaggle_creds(): +def read_kaggle_creds(credentials_file: Path | None = None) -> bool: try: - if os.path.exists('./kaggle.json'): - with open('./kaggle.json', 'r') as f: + if credentials_file is None: + credentials_file = Path('./kaggle.json') + if credentials_file.exists(): + with open(credentials_file, 'r') as f: key = f.read() data = json.loads(key) if 'username' in data and 'key' in data: os.environ['KAGGLE_USERNAME'] = data['username'] os.environ['KAGGLE_KEY'] = data['key'] return True + else: + return False except Exception: return False diff --git a/opendatasets/utils/kaggle_direct.py b/opendatasets/utils/kaggle_direct.py index b8da598..83a21e1 100644 --- a/opendatasets/utils/kaggle_direct.py +++ b/opendatasets/utils/kaggle_direct.py @@ -22,7 +22,7 @@ def get_kaggle_dataset_id(dataset_id_or_url): elif not is_url(dataset_id_or_url): parts = dataset_id_or_url.split('/')[:2] assert len(parts) == 2, 'Invalid Kaggle dataset URL or ID: ' + \ - dataset_id_or_url + dataset_id_or_url return '/'.join(parts) @@ -37,10 +37,8 @@ def get_kaggle_download_hash(dataset_id): def download_kaggle_dataset(dataset_url, data_dir='.', force=True, dry_run=False): dataset_id = get_kaggle_dataset_id(dataset_url) print('Kaggle dataset ID: ', dataset_id) - raw_dataset_url = ('https://www.kaggle.com/' + - dataset_id + - 
'/download?resource=download&downloadHash=' + - get_kaggle_download_hash(dataset_id)) + raw_dataset_url = (f'https://www.kaggle.com/{dataset_id}/download?resource' + f'=download&downloadHash={get_kaggle_download_hash(dataset_id)}') folder_name = dataset_id.split('/')[-1] archive_name = folder_name + '.zip' download_url(raw_dataset_url, root=data_dir, diff --git a/opendatasets/utils/network.py b/opendatasets/utils/network.py index 0098eab..2695774 100644 --- a/opendatasets/utils/network.py +++ b/opendatasets/utils/network.py @@ -1,33 +1,39 @@ -import os import cgi +import os import re + from tqdm import tqdm + from opendatasets.utils.md5 import check_integrity -import urllib try: import urllib.request as request + urlopen = request.urlopen except Exception: # For Python 2.7 import urllib + urlopen = urllib.urlopen try: import urllib.request as request + urlretrieve = request.urlretrieve except Exception: import urllib + # For Python 2.7 urlretrieve = urllib.urlretrieve def download_url(url, root, filename=None, md5=None, force=False, dry_run=False): """Download a file from a url and place it in root. + Args: url (str): URL to download file from root (str): Directory to place downloaded file in - filename (str, optional): + filename (str, optional): Name of the file Name to save the file under. If None, use the basename of the URL md5 (str, optional): MD5 checksum of the download. 
If None, do not check """ @@ -40,13 +46,13 @@ def download_url(url, root, filename=None, md5=None, force=False, dry_run=False) filename = os.path.basename(url) fpath = os.path.join(root, filename) - if not(os.path.exists(root)): + if not (os.path.exists(root)): os.makedirs(root) # check if file is already present locally if not force and check_integrity(fpath, md5): print('Using downloaded and verified file: ' + fpath) - else: # download the file + else: # download the file try: print('Downloading ' + url + ' to ' + fpath) if not dry_run: diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..a099c77 --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,3 @@ +pytest==7.2.2 +pytest-mock==3.10.0 +mypy==1.2.0 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..a424b25 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,40 @@ +[flake8] +max-line-length = 120 +max-complexity = 18 +ignore = E203, E266, W503 +docstring-convention = google +per-file-ignores = __init__.py:F401 +exclude = .git, + __pycache__, + setup.py, + build, + dist, + docs, + releases, + .venv, + .tox, + .mypy_cache, + .pytest_cache, + .vscode, + .github, + # By default test codes will be linted. 
+ # tests + +[mypy] +ignore_missing_imports = True + +[coverage:run] +# uncomment the following to omit files during running +#omit = +[coverage:report] +exclude_lines = +    pragma: no cover +    def __repr__ +    if self.debug: +    if settings.DEBUG +    raise AssertionError +    raise NotImplementedError +    if 0: +    if __name__ == .__main__.: +    def main + diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..1c44764 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture(scope='session') +def fixtures_folder(): +    folder = Path(__file__).parent / 'fixtures' +    return folder diff --git a/tests/fixtures/kaggle.json b/tests/fixtures/kaggle.json new file mode 100644 index 0000000..df2e728 --- /dev/null +++ b/tests/fixtures/kaggle.json @@ -0,0 +1 @@ +{"username": "wintersoldier", "key": "adamantium22"} \ No newline at end of file diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/utils/test_kaggle_api.py b/tests/utils/test_kaggle_api.py new file mode 100644 index 0000000..917ddf1 --- /dev/null +++ b/tests/utils/test_kaggle_api.py @@ -0,0 +1,46 @@ +import json +import os +from pathlib import Path + +from opendatasets.utils.kaggle_api import read_kaggle_creds + + +def test_read_kaggle_creds(): +    found = read_kaggle_creds() +    assert not found + + +def test_read_kaggle_creds_file_found(): +    """Creates a local kaggle credential file and verifies that the credentials are loaded""" +    credentials_file = Path.cwd() / 'kaggle.json' +    credentials_file.unlink(missing_ok=True) + +    os.environ['KAGGLE_USERNAME'] = '' +    os.environ['KAGGLE_KEY'] = '' + +    kaggle_data = {'username': 'wolverine', 'key': 'adamantium11'} +    with open(credentials_file, 'w') as f: +        json.dump(kaggle_data, f) + +    assert credentials_file.exists() + +    found = 
read_kaggle_creds() + assert found + assert os.environ['KAGGLE_USERNAME'] == kaggle_data['username'] + assert os.environ['KAGGLE_KEY'] == kaggle_data['key'] + + credentials_file.unlink() + + +def test_read_kaggle_creds_custom_credentials(fixtures_folder): + credentials_file = fixtures_folder / 'kaggle.json' + + os.environ['KAGGLE_USERNAME'] = '' + os.environ['KAGGLE_KEY'] = '' + + assert credentials_file.exists() + + found = read_kaggle_creds(credentials_file=credentials_file) + assert found + assert os.environ['KAGGLE_USERNAME'] == 'wintersoldier' + assert os.environ['KAGGLE_KEY'] == 'adamantium22'