Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added custom path for the kaggle credentials file. #13

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Inline Python helper: opens the file given as argv[1] in the default browser.
define BROWSER_PYSCRIPT
import os, webbrowser, sys

from urllib.request import pathname2url

webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT

BROWSER := python -c "$$BROWSER_PYSCRIPT"
sources = opendatasets

# Declare every target that does not produce a file of the same name as phony,
# so make never skips one because a matching file or directory exists.
# (The previous list omitted the clean-* helpers, cov, dist and release, and
# named a non-existent "coverage" target — the actual target is "cov".)
.PHONY: test format lint unittest cov pre-commit clean clean-build clean-pyc clean-cache dist release

clean-build: ## remove build artifacts
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

test: lint unittest

format:
	isort $(sources) tests
	black $(sources) tests

lint:
	flake8 $(sources) tests
# mypy $(sources) tests

unittest: clean
	pytest

cov:
	pytest --cov=$(sources) --cov-branch --cov-report=term-missing tests
	coverage report -m
	coverage html
	$(BROWSER) htmlcov/index.html

pre-commit:
	pre-commit run --all-files

clean-cache:
	rm -rf .mypy_cache .pytest_cache
	rm -rf *.egg-info
	rm -rf .tox dist site
	rm -rf coverage.xml .coverage
	rm -rf output/*.*

clean: clean-build clean-pyc clean-cache

dist: clean
	poetry build

release: dist ## package and upload a release
	twine upload dist/*
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,12 @@ git clone https://github.com/JovianML/opendatasets.git
conda create -n opendatasets python=3.5
conda activate opendatasets
pip install -r requirements.txt
pip install -r requirements_dev.txt
```

3. Open up the project in VS code and make your changes. Make sure to install the Python Extension for VS Code and select the `opendatasets` conda environment.

### Testing


This package is developed and maintained by the [Jovian](https://www.jovian.ai) team.
4 changes: 3 additions & 1 deletion opendatasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import importlib

from opendatasets.utils.kaggle_direct import is_kaggle_url
from opendatasets.utils.network import download_url, is_url
from opendatasets.utils.googledrive import is_google_drive_url, download_google_drive
import os
from opendatasets._version import __version__
from opendatasets.utils.kaggle_api import download_kaggle_dataset, is_kaggle_url
from opendatasets.utils.kaggle_api import download_kaggle_dataset
from opendatasets.utils.archive import extract_archive


Expand Down
4 changes: 3 additions & 1 deletion opendatasets/datasets/owid-covid-19-latest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from opendatasets.utils import download_url

last_updated = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data' \
'/owid-covid-data-last-updated-timestamp.txt'
URLs = [
'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',
'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data-last-updated-timestamp.txt',
last_updated,
'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-codebook.csv'
]

Expand Down
17 changes: 10 additions & 7 deletions opendatasets/utils/googledrive.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from opendatasets.utils.network import get_filename_cd
import os
import re
import zipfile
import cgi
from opendatasets.utils.md5 import check_integrity
from urllib.parse import urlparse

from tqdm import tqdm

from opendatasets.utils.md5 import check_integrity
from opendatasets.utils.network import get_filename_cd


def download_google_drive(url, data_dir):
print('Downloading from Google Drive (may take a while):', url)
Expand All @@ -28,7 +29,8 @@ def download_google_drive(url, data_dir):


def is_google_drive_url(url):
return url.startswith('https://drive.google.com') or url.startswith('http://drive.google.com') or url.startswith('drive.google.com')
return url.startswith('https://drive.google.com') or url.startswith('http://drive.google.com') \
or url.startswith('drive.google.com')


def _get_google_drive_file_id(url):
Expand All @@ -43,6 +45,7 @@ def _get_google_drive_file_id(url):

return match.group("id")


def _quota_exceeded(response):
return "Google Drive - Quota exceeded" in response.text

Expand Down Expand Up @@ -88,9 +91,9 @@ def download_file_from_google_drive(file_id, root, filename=None, md5=None):

if _quota_exceeded(response):
msg = (
"The daily quota of the file " + filename + " is exceeded and it " +
"can't be downloaded. This is a limitation of Google Drive " +
"and can only be overcome by trying again later."
"The daily quota of the file " + filename + " is exceeded and it " +
"can't be downloaded. This is a limitation of Google Drive " +
"and can only be overcome by trying again later."
)
raise RuntimeError(msg)

Expand Down
19 changes: 13 additions & 6 deletions opendatasets/utils/kaggle_api.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import json
import os
from opendatasets.utils.kaggle_direct import get_kaggle_dataset_id, is_kaggle_url
from opendatasets.utils.archive import extract_archive
from pathlib import Path

import click
import json

from opendatasets.utils.archive import extract_archive
from opendatasets.utils.kaggle_direct import get_kaggle_dataset_id


def _get_kaggle_key():
Expand All @@ -17,16 +20,20 @@ def _get_kaggle_key():
return user_input


def read_kaggle_creds():
def read_kaggle_creds(credentials_file: Path | None = None) -> bool:
try:
if os.path.exists('./kaggle.json'):
with open('./kaggle.json', 'r') as f:
if credentials_file is None:
credentials_file = Path('./kaggle.json')
if credentials_file.exists():
with open(credentials_file, 'r') as f:
key = f.read()
data = json.loads(key)
if 'username' in data and 'key' in data:
os.environ['KAGGLE_USERNAME'] = data['username']
os.environ['KAGGLE_KEY'] = data['key']
return True
else:
return False
except Exception:
return False

Expand Down
8 changes: 3 additions & 5 deletions opendatasets/utils/kaggle_direct.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_kaggle_dataset_id(dataset_id_or_url):
elif not is_url(dataset_id_or_url):
parts = dataset_id_or_url.split('/')[:2]
assert len(parts) == 2, 'Invalid Kaggle dataset URL or ID: ' + \
dataset_id_or_url
dataset_id_or_url
return '/'.join(parts)


Expand All @@ -37,10 +37,8 @@ def get_kaggle_download_hash(dataset_id):
def download_kaggle_dataset(dataset_url, data_dir='.', force=True, dry_run=False):
dataset_id = get_kaggle_dataset_id(dataset_url)
print('Kaggle dataset ID: ', dataset_id)
raw_dataset_url = ('https://www.kaggle.com/' +
dataset_id +
'/download?resource=download&downloadHash=' +
get_kaggle_download_hash(dataset_id))
raw_dataset_url = (f'https://www.kaggle.com/{dataset_id}/download?resource'
f'=download&downloadHash={get_kaggle_download_hash(dataset_id)}')
folder_name = dataset_id.split('/')[-1]
archive_name = folder_name + '.zip'
download_url(raw_dataset_url, root=data_dir,
Expand Down
16 changes: 11 additions & 5 deletions opendatasets/utils/network.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,39 @@
import os
import cgi
import os
import re

from tqdm import tqdm

from opendatasets.utils.md5 import check_integrity
import urllib

try:
import urllib.request as request

urlopen = request.urlopen
except Exception:
# For Python 2.7
import urllib

urlopen = urllib.urlopen

try:
import urllib.request as request

urlretrieve = request.urlretrieve
except Exception:
import urllib

# For Python 2.7
urlretrieve = urllib.urlretrieve


def download_url(url, root, filename=None, md5=None, force=False, dry_run=False):
"""Download a file from a url and place it in root.

Args:
url (str): URL to download file from
root (str): Directory to place downloaded file in
filename (str, optional):
filename (str, optional): Name of the file
Name to save the file under. If None, use the basename of the URL
md5 (str, optional): MD5 checksum of the download. If None, do not check
"""
Expand All @@ -40,13 +46,13 @@ def download_url(url, root, filename=None, md5=None, force=False, dry_run=False)
filename = os.path.basename(url)
fpath = os.path.join(root, filename)

if not(os.path.exists(root)):
if not (os.path.exists(root)):
os.makedirs(root)

# check if file is already present locally
if not force and check_integrity(fpath, md5):
print('Using downloaded and verified file: ' + fpath)
else: # download the file
else: # download the file
try:
print('Downloading ' + url + ' to ' + fpath)
if not dry_run:
Expand Down
3 changes: 3 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pytest==7.2.2
pytest-mock==3.10.0
mypy==1.2.0
40 changes: 40 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
[flake8]
max-line-length = 120
max-complexity = 18
ignore = E203, E266, W503
docstring-convention = google
per-file-ignores = __init__.py:F401
exclude = .git,
__pycache__,
setup.py,
build,
dist,
docs,
releases,
.venv,
.tox,
.mypy_cache,
.pytest_cache,
.vscode,
.github,
# By default test codes will be linted.
# tests

[mypy]
ignore_missing_imports = True

[coverage:run]
# uncomment the following to omit files during running
#omit =
[coverage:report]
exclude_lines =
pragma: no cover
def __repr__
if self.debug:
if settings.DEBUG
raise AssertionError
raise NotImplementedError
if 0:
if __name__ == .__main__.:
def main

Empty file added tests/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pathlib import Path

import pytest


@pytest.fixture(scope='session')
def fixtures_folder():
    """Session-scoped Path to the directory holding test fixture files."""
    return Path(__file__).parent / 'fixtures'
1 change: 1 addition & 0 deletions tests/fixtures/kaggle.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"username": "wintersoldier", "key": "adamantium22"}
Empty file added tests/utils/__init__.py
Empty file.
46 changes: 46 additions & 0 deletions tests/utils/test_kaggle_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
import os
from pathlib import Path

from opendatasets.utils.kaggle_api import read_kaggle_creds


def test_read_kaggle_creds():
    """With no kaggle.json in the working directory, no credentials are found."""
    assert not read_kaggle_creds()


def test_read_kaggle_creds_file_found():
    """Creates a local kaggle credential file and verifies that the credentials are loaded."""
    # Path.cwd() is a classmethod: the original Path(__file__).cwd() ignored
    # __file__ entirely and obscured that the file lands in the current
    # working directory.
    credentials_file = Path.cwd() / 'kaggle.json'
    credentials_file.unlink(missing_ok=True)

    os.environ['KAGGLE_USERNAME'] = ''
    os.environ['KAGGLE_KEY'] = ''

    kaggle_data = {'username': 'wolverine', 'key': 'adamantium11'}
    with open(credentials_file, 'w') as f:
        json.dump(kaggle_data, f)

    try:
        assert credentials_file.exists()

        found = read_kaggle_creds()
        assert found
        assert os.environ['KAGGLE_USERNAME'] == kaggle_data['username']
        assert os.environ['KAGGLE_KEY'] == kaggle_data['key']
    finally:
        # Always remove the temporary credentials file, even when an
        # assertion fails, so one failing run cannot poison later tests.
        credentials_file.unlink()


def test_read_kaggle_creds_custom_credentials(fixtures_folder):
    """Credentials are loaded from an explicitly supplied credentials file."""
    credentials_file = fixtures_folder / 'kaggle.json'
    assert credentials_file.exists()

    os.environ['KAGGLE_USERNAME'] = ''
    os.environ['KAGGLE_KEY'] = ''

    assert read_kaggle_creds(credentials_file=credentials_file)
    assert os.environ['KAGGLE_USERNAME'] == 'wintersoldier'
    assert os.environ['KAGGLE_KEY'] == 'adamantium22'