
Merge pull request #2 from datopian/feature/filestore_resource_upload
Upload archived resources to filestore
zelima committed Sep 17, 2020
2 parents ddfe346 + 77e0399 commit 93d5aad
Showing 3 changed files with 67 additions and 5 deletions.
15 changes: 15 additions & 0 deletions README.rst
@@ -270,6 +270,21 @@ Config settings
root /www/resource_cache;
}

6. Alternatively, you can upload archived resources to the s3filestore. Make sure that you
   have properly set up `ckanext-s3filestore <https://github.com/datopian/ckanext-s3filestore>`_.

Add the following values to the CKAN config file (a sample config snippet is shown at the end of this step):

* ``ckanext.archiver.s3upload_enable`` = ``True`` enables uploading archived resources to cloud storage (defaults to false).
* ``ckanext.s3filestore.aws_storage_path`` = ``my-site-name``. Your filestore project path, for example ``ckan/storage_path/archived_resource_dir``. This setting is required for uploading the archived resources.

The resources are uploaded to the s3filestore in the directory ``<s3filestore.aws_bucket_name>/<s3filestore.aws_storage_path>/archived_resources/<resource_id>/``.

A cron job must run at least once a week to update archived resources and generate a presigned URL for downloading the resources from the ``s3filestore``. The presigned URL expires 7 days (604800 seconds) after the ``archiver update`` command is run::

    0 0 * * 0 paster --plugin=ckanext-archiver archiver update -c /srv/app/production.ini
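
A minimal sketch of the relevant config, assuming ckanext-s3filestore is already configured; the storage path and bucket name below are placeholders::

    ckanext.archiver.s3upload_enable = True
    ckanext.s3filestore.aws_storage_path = my-site-name
    ckanext.s3filestore.aws_bucket_name = my-bucket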


Legacy settings
~~~~~~~~~~~~~~~

2 changes: 0 additions & 2 deletions ckanext/archiver/plugin.py
@@ -149,8 +149,6 @@ def register_reports(self):
def update_config(self, config):
p.toolkit.add_template_directory(config, 'templates')
archive_dir = config.get('ckanext.archiver.archive_dir')
if archive_dir:
p.toolkit.add_public_directory(config, archive_dir)

# IActions

55 changes: 52 additions & 3 deletions ckanext/archiver/tasks.py
@@ -17,7 +17,7 @@

from requests.packages import urllib3

from ckan.common import _
from ckan.common import _, config
from ckan.lib.celery_app import celery
from ckan.lib import uploader
from ckan import plugins as p
@@ -548,11 +548,60 @@ def archive_resource(context, resource, log, result=None, url_timeout=30):
log.warning('Not saved cache_url because no value for '
'ckanext.archiver.cache_url_root in config')
raise ArchiveError(_('No value for ckanext.archiver.cache_url_root in config'))
cache_url = urlparse.urljoin(context['cache_url_root'],
'%s/%s' % (relative_archive_path, file_name))

# If ckanext.archiver.s3upload_enable is set, upload the archived file to the S3
# filestore and use a presigned URL as cache_url; otherwise use cache_url_root.
archiver_s3upload_enable = config.get('ckanext.archiver.s3upload_enable')
resource_id_dir = relative_archive_path.split('/')[-1]

if archiver_s3upload_enable:
upload_obj, key_path = upload_archived_resource(resource_id_dir, file_name, saved_file)
cache_url = generate_cache_url(upload_obj, key_path)
else:
cache_url = urlparse.urljoin(context['cache_url_root'],
'%s/%s' % (relative_archive_path, file_name))

return {'cache_filepath': saved_file,
'cache_url': cache_url}

def upload_archived_resource(resource_id_dir, filename, saved_file):
'''
Uploads the archived resource to the s3filestore under
<S3FILESTORE__AWS_BUCKET_NAME>/<S3FILESTORE__AWS_STORAGE_PATH>/archived_resources/<resource_id>/
'''

storage_path = config.get('ckanext.s3filestore.aws_storage_path')

if not storage_path:
log.warning('Not saved to filestore because no value for '
'ckanext.s3filestore.aws_storage_path in config')
raise ArchiveError(_('No value for ckanext.s3filestore.aws_storage_path in config'))

# Stream the archived file to the filestore via CKAN's resource uploader
with open(saved_file, 'rb') as save_file:
upload = uploader.get_uploader('archived_resources')
upload.upload_file = save_file
upload.filename = filename
upload.filepath = os.path.join(storage_path, 'archived_resources', resource_id_dir, filename)
upload.id = filename
upload.clear = False
upload.upload(uploader.get_max_resource_size())

return upload, upload.filepath

def generate_cache_url(upload_obj, key_path):
'''
Generates a presigned URL for downloading the resource from the s3filestore
and returns it as the cache_url. The URL expires after 7 days (604800 seconds).
'''
bucket_name = config.get('ckanext.s3filestore.aws_bucket_name')
region = config.get('ckanext.s3filestore.region_name')
host_name = config.get('ckanext.s3filestore.host_name')
bucket = upload_obj.get_s3_bucket(bucket_name)
s3 = upload_obj.get_s3_session()
client = s3.client(service_name='s3', endpoint_url=host_name)
cache_url = client.generate_presigned_url(ClientMethod='get_object',
Params={'Bucket': bucket.name, 'Key': key_path},
ExpiresIn=604800)  # 7 days; the weekly archiver update cron regenerates it

return cache_url
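
# For illustration, a standalone sketch of the same presigned-URL call made with
# boto3 directly; the bucket, key and endpoint values below are placeholders and
# are not taken from this extension's config.
import boto3

session = boto3.session.Session()
s3_client = session.client(service_name='s3', endpoint_url='https://s3.amazonaws.com')
example_url = s3_client.generate_presigned_url(
    ClientMethod='get_object',
    Params={'Bucket': 'my-bucket',
            'Key': 'my-site-name/archived_resources/RESOURCE_ID/data.csv'},
    ExpiresIn=604800)  # 7 days, same lifetime as the archiver's cache_url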

def notify_resource(resource, queue, cache_filepath):
'''
