diff --git a/.gitignore b/.gitignore index 8570dc5..71000f6 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,6 @@ coverage.xml # Sphinx documentation docs/_build/ + +# Generated sitemaps (default directory) +ckanext/sitemap/public/sitemap* \ No newline at end of file diff --git a/README.md b/README.md index 4c31804..f8e7420 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,36 @@ [![Tests](https://github.com//ckanext-sitemap/workflows/Tests/badge.svg?branch=main)](https://github.com//ckanext-sitemap/actions) # ckanext-sitemap -A CKAN extension that generates a sitemap XML file is designed to create a structured map of a CKAN instance's datasets and resources, making it easier for search engines to discover and index the available data. ! -## Installation +ckanext-sitemap is a CKAN extension that generates sitemap XML files: a structured map of a CKAN instance's datasets and resources that makes it easier for search engines to discover and index the available data. -**TODO:** Add any additional install steps to the list below. - For example installing any non-Python dependencies or adding any required - config settings. +## Table of Contents + +- [Getting Started](#getting-started) +- [Contributing](#contributing) +- [Versioning](#versioning) +- [License](#license) + +## Getting Started + +### Installation To install ckanext-sitemap: 1. Activate your CKAN virtual environment, for example: - . /usr/lib/ckan/default/bin/activate + ```bash + . /usr/lib/ckan/default/bin/activate + ``` -2. Clone the source and install it on the virtualenv +2. Clone the source and install it in the virtual environment + ```bash git clone https://github.com//ckanext-sitemap.git cd ckanext-sitemap pip install -e . - pip install -r requirements.txt + pip install -r requirements.txt + ``` 3. Add `sitemap` to the `ckan.plugins` setting in your CKAN config file (by default the config file is located at @@ -28,7 +38,60 @@ To install ckanext-sitemap: 4. Restart CKAN. 
For example if you've deployed CKAN with Apache on Ubuntu: - sudo service apache2 reload + ```bash + sudo service apache2 reload + ``` + +### Configuration + +You can configure this extension in the `ckan.ini` file of your CKAN instance. Set the following configuration options according to your requirements for sitemap generation and management. + +Configuration Option | Default Value | Description +-------------------- | ------------- | ----------- +`ckanext.sitemap.directory` | [`./ckanext/sitemap/public`](./ckanext/sitemap/public/) | The directory path for storing generated sitemaps. +`ckanext.sitemap.max_items` | `5000` | Maximum number of items per sitemap file. If the total number of URLs exceeds this limit, the sitemap is split into multiple files. +`ckanext.sitemap.autorenew` | `True` | If this option is enabled, the sitemaps will be automatically renewed whenever a user requests a sitemap and the existing sitemap is older than the Time-To-Live (TTL) value specified. Set this to `False` if you prefer a cron job to handle sitemap generation. +`ckanext.sitemap.ttl` | `28800` (8 hours) | Time-To-Live (TTL) for sitemaps. Sitemaps older than this value (in seconds) are regenerated when a user visits a sitemap route. +`ckanext.sitemap.resources` | `True` | Determines whether package resources (distributions) should be included in the sitemaps. +`ckanext.sitemap.groups` | `True` | Determines whether groups and organizations should be included in the sitemaps. +`ckanext.sitemap.language_alternatives` | `True` | Determines whether language alternatives should be included in the sitemaps. +`ckanext.sitemap.custom_uris` | `Undefined` | A list of additional sitemap URIs separated by whitespace or newlines. These URIs will be included in the sitemap generation process alongside the default CKAN URIs. 
+ +### Using Cron for Regular Sitemap Generation + +Using cron to generate sitemaps regularly can be advantageous, especially if the sitemap generation process is time-consuming. + +Ensure that the sitemap generation occurs within the time frame specified by `ckanext.sitemap.ttl`, or alternatively, set `ckanext.sitemap.autorenew` to `False` to prevent accidental triggering of sitemap generation by users. + +**Example Cron Job:** + +To schedule the command to run at 2 AM, 10 AM, and 6 PM: + +```bash +0 2,10,18 * * * /usr/lib/ckan/default/bin/ckan -c /etc/ckan/default/ckan.ini ckanext-sitemap generate > /dev/null 2>&1 +``` + +## Available Commands + +- `generate` + + This command triggers the generation of the sitemap. + + Usage: + + ```bash + ckan -c /etc/ckan/default/ckan.ini ckanext-sitemap generate + ``` + +## Contributing + +To contribute to this project, create a branch or fork this repository, make +your changes and create a pull request. + +## Versioning + +We use [SemVer](http://semver.org/) for versioning. For the versions available, see +the tags on this repository. ## License diff --git a/ckanext/sitemap/cli.py b/ckanext/sitemap/cli.py new file mode 100644 index 0000000..d1a01b3 --- /dev/null +++ b/ckanext/sitemap/cli.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- + +import click +import ckanext.sitemap.sitemap as sm + +def get_commands(): + return [ckanext_sitemap] + + +@click.group() +def ckanext_sitemap(): + """ckanext-sitemap + + Usage: + + ckanext-sitemap generate + - (Re)generate sitemap. + """ + + +@ckanext_sitemap.command() +def generate(): + """ + Command to generate sitemap. 
+ """ + try: + click.echo('Starting sitemap generation..') + sm.generate_sitemap() + click.echo('Finished sitemap generation.') + + except Exception as e: + # Report on stderr and exit non-zero (ClickException exits with code 1) so cron/CI can detect a failed run + raise click.ClickException(f'Sitemap generation failed: {e}') from e diff --git a/ckanext/sitemap/plugin.py b/ckanext/sitemap/plugin.py index 02f214f..84f8b10 100644 --- a/ckanext/sitemap/plugin.py +++ b/ckanext/sitemap/plugin.py @@ -1,11 +1,13 @@ import ckan.plugins as plugins import ckan.plugins.toolkit as toolkit import ckanext.sitemap.view as view +from ckanext.sitemap import cli class SitemapPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IConfigurer) plugins.implements(plugins.IBlueprint) + plugins.implements(plugins.IClick) # IConfigurer def update_config(self, config_): @@ -16,6 +18,7 @@ def update_config(self, config_): # IBlueprint def get_blueprint(self): return view.get_blueprints() - - + # IClick + def get_commands(self): + return cli.get_commands() diff --git a/ckanext/sitemap/sitemap.py b/ckanext/sitemap/sitemap.py new file mode 100644 index 0000000..944bf2e --- /dev/null +++ b/ckanext/sitemap/sitemap.py @@ -0,0 +1,276 @@ +import logging +from datetime import datetime, timedelta +from typing import Set +import os +import sqlalchemy as sa + +from flask import make_response +from ckan.model import Session, Package, Group + +import ckan.plugins.toolkit as tk +from lxml import etree + +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" +XHTML_NS = "http://www.w3.org/1999/xhtml" + +SITEMAP_DIR = tk.config.get('ckanext.sitemap.directory', os.path.join( + os.path.dirname(__file__), 'public')) +SITEMAP_AUTORENEW = tk.asbool(tk.config.get( + 'ckanext.sitemap.autorenew', True)) +SITEMAP_TTL = int(tk.config.get('ckanext.sitemap.ttl', 8*3600)) +MAX_ITEMS = int(tk.config.get('ckanext.sitemap.max_items', 5000))  # cast like SITEMAP_TTL: an ini value is a string and would break `count % MAX_ITEMS` +INC_RESOURCES = tk.asbool(tk.config.get( + 'ckanext.sitemap.resources', True)) +INC_GROUPS = tk.asbool(tk.config.get( +
'ckanext.sitemap.groups', True)) +INC_LANG_ALTS = tk.asbool(tk.config.get( + 'ckanext.sitemap.language_alternatives', True)) + +log = logging.getLogger(__name__) + + +def _get_locales_from_config() -> Set: + locales_offered = tk.config.get('ckan.locales_offered') + filtered_out = tk.config.get('ckan.locales_filtered_out') + locale_default = [tk.config.get('ckan.locale_default', 'en')] + + all_locales = set(locales_offered) + all_locales -= (set(filtered_out) | set(locale_default)) + return all_locales + + +def _create_language_alternatives(link, url): + ckan_site_url = tk.config.get("ckan.site_url") + for lang in _get_locales_from_config(): + # Check if the link already starts with the CKAN site URL + if link.startswith(ckan_site_url): + href = f"{ckan_site_url}/{lang}{link[len(ckan_site_url):]}" + else: + href = f"{ckan_site_url}/{lang}{link}" + + attrib = { + "rel": "alternate", + "hreflang": lang, + "href": href, + } + etree.SubElement(url, "{http://www.w3.org/1999/xhtml}link", attrib) + + +def _generate_filename(index): + return f"sitemap-{index}.xml" + + +def _generate_index_filename(): + return "sitemap_index.xml" + + +def _remove_file(file): + log.info("Removing sitemap file: %s", file) + os.remove(os.path.join(SITEMAP_DIR, file)) + + +def _add_url_to_sitemap(file_root, url, lastmod, uri, inc_lang_alts=True): + + ckan_site_url = tk.config.get("ckan.site_url") + + # Check if the link already starts with the CKAN site URL + if url.startswith(ckan_site_url): + # If the link already starts with the CKAN site URL, use it as is + uri = url + else: + # If not, append the URL to the CKAN site URL + uri = f"{ckan_site_url.rstrip('/')}/{url.lstrip('/')}" + + # Create URL element for each URI + url_elem = etree.SubElement(file_root, "url") + loc = etree.SubElement(url_elem, "loc") + loc.text = uri + lastmod_elem = etree.SubElement(url_elem, "lastmod") + lastmod_elem.text = lastmod.strftime("%Y-%m-%d") + + # Add language alternatives if needed + if INC_LANG_ALTS and 
inc_lang_alts: + _create_language_alternatives(uri, url_elem) + + return url_elem + + +def _start_new_sitemap(file_root, sitemap_item_count, sitemap_index): + if file_root is not None: + with open(os.path.join(SITEMAP_DIR, _generate_filename(sitemap_index)), "wb") as f: + f.write(etree.tostring(file_root, pretty_print=True)) + sitemap_index += 1 + file_root = etree.Element( + "urlset", nsmap={None: SITEMAP_NS, "xhtml": XHTML_NS}) + sitemap_item_count = 0 + return file_root, sitemap_index, sitemap_item_count + + +def _generate_sitemap_files(): + sitemap_item_count = 0 + sitemap_index = 0 + file_root = None + + # Generate sitemap entries for default CKAN URIs + ckan_uris = [ + tk.url_for(controller="home", action="index", _external=True), + tk.url_for(controller="dataset", action="search", _external=True), + tk.url_for(controller="organization", action="index", _external=True), + tk.url_for(controller="group", action="index", _external=True), + ] + + lastmod = datetime.now() + + for uri in ckan_uris: + if sitemap_item_count % MAX_ITEMS == 0: + file_root, sitemap_index, sitemap_item_count = _start_new_sitemap( + file_root, sitemap_item_count, sitemap_index + ) + _add_url_to_sitemap(file_root, uri, lastmod, uri) + sitemap_item_count += 1 + + # Get additional URIs from the CKAN configuration (if present) + custom_uris = tk.config.get('ckanext.sitemap.custom_uris', '').split() + + for uri in custom_uris: + if sitemap_item_count % MAX_ITEMS == 0: + file_root, sitemap_index, sitemap_item_count = _start_new_sitemap( + file_root, sitemap_item_count, sitemap_index + ) + _add_url_to_sitemap(file_root, uri, lastmod, uri, False) + sitemap_item_count += 1 + + # Generate sitemap entries for packages + pkgs = ( + Session.query(Package) + .filter(Package.type == "dataset") + .filter(Package.private != True) + .filter(Package.state == "active") + .all() + ) + for pkg in pkgs: + if sitemap_item_count % MAX_ITEMS == 0: + file_root, sitemap_index, sitemap_item_count = 
_start_new_sitemap( + file_root, sitemap_item_count, sitemap_index + ) + pkg_url = tk.url_for(controller="dataset", action="read", id=pkg.name) + _add_url_to_sitemap(file_root, pkg_url, pkg.metadata_modified, + tk.config.get("ckan.site_url") + pkg_url) + sitemap_item_count += 1 + + # Generate sitemap entries for resources (if enabled) + if INC_RESOURCES: + for res in pkg.resources: + if sitemap_item_count % MAX_ITEMS == 0: + file_root, sitemap_index, sitemap_item_count = _start_new_sitemap( + file_root, sitemap_item_count, sitemap_index + ) + res_url = tk.url_for(controller="dataset_resource", action="read", id=pkg.name, + package_type=tk.h.default_package_type(), resource_id=res.id) + _add_url_to_sitemap(file_root, res_url, res.created, tk.config.get( + "ckan.site_url") + res_url) + sitemap_item_count += 1 + + # Generate sitemap entries for organizations and groups (if enabled) + if INC_GROUPS: + groups = ( + Session.query(Group) + .filter(Group.state == "active") + .filter(sa.or_(Group.type == "organization", Group.type == "group")) + .all() + ) + + for group in groups: + group_url = None + if group.type == "organization": + group_url = tk.url_for(controller="organization", action="read", id=group.name) + elif group.type == "group": + group_url = tk.url_for(controller="group", action="read", id=group.name) + + if group_url: + if sitemap_item_count % MAX_ITEMS == 0: + file_root, sitemap_index, sitemap_item_count = _start_new_sitemap( + file_root, sitemap_item_count, sitemap_index + ) + _add_url_to_sitemap( + file_root, group_url, lastmod, tk.config.get("ckan.site_url") + group_url + ) + sitemap_item_count += 1 + + + # Write the last sitemap file + if file_root is not None: + with open(os.path.join(SITEMAP_DIR, _generate_filename(sitemap_index)), "wb") as f: + f.write(etree.tostring(file_root, pretty_print=True)) + + return sitemap_index + 1 + + +def generate_sitemap(): + try: + log.info("Generating sitemaps") + + total_sitemaps = _generate_sitemap_files() + + # 
Generate sitemap index + index_root = etree.Element( + "sitemapindex", nsmap={None: SITEMAP_NS}) + for i in range(total_sitemaps): # Include all generated sitemaps + index_url = etree.SubElement(index_root, "sitemap") + loc = etree.SubElement(index_url, "loc") + + # Add the entry for the sitemap file + sitemap_file_name = _generate_filename(i) + sitemap_url = tk.config.get( + "ckan.site_url") + "/" + sitemap_file_name + + loc.text = sitemap_url + + with open(os.path.join(SITEMAP_DIR, _generate_index_filename()), "wb") as f: + f.write(etree.tostring(index_root, pretty_print=True)) + + except Exception as e: + log.exception("Error occurred during sitemap generation: %s", e) + raise + + +def generate_sitemap_response(index=None): + # Check modification time of sitemap index file + index_file_path = os.path.join( + SITEMAP_DIR, _generate_index_filename()) + if os.path.exists(index_file_path): + index_mtime = os.path.getmtime(index_file_path) + if datetime.fromtimestamp(index_mtime) < (datetime.now() - timedelta(seconds=SITEMAP_TTL)) and SITEMAP_AUTORENEW: + # Regenerate sitemap index file if older than TTL and autorenew is enabled + log.info("Regenerating sitemap index file (older than TTL)") + _remove_file(_generate_index_filename()) + generate_sitemap() + else: + # Generate sitemap index file if it doesn't exist + log.info("Generating sitemap index file (not present)") + generate_sitemap() + + if index is None: + requested_file = _generate_index_filename() + return create_response(requested_file) + else: + sitemap_files = [file for file in os.listdir( + SITEMAP_DIR) if file.startswith("sitemap-")] + + if not sitemap_files: + generate_sitemap() + sitemap_files = [file for file in os.listdir( + SITEMAP_DIR) if file.startswith("sitemap-")] + + requested_file = _generate_filename(index) + if requested_file in sitemap_files: + return create_response(requested_file) + else: + return make_response("Not Found", 404) + + +def create_response(file): + with 
open(os.path.join(SITEMAP_DIR, file), "rb") as f: + response = make_response(f.read(), 200) + response.headers["Content-Type"] = "application/xml" + return response diff --git a/ckanext/sitemap/view.py b/ckanext/sitemap/view.py index bf436a1..20316fc 100644 --- a/ckanext/sitemap/view.py +++ b/ckanext/sitemap/view.py @@ -1,144 +1,44 @@ import logging -from datetime import datetime, timezone import os from flask import Blueprint, make_response -from ckan.model import Session, Package import ckan.plugins.toolkit as tk -from lxml import etree +import ckanext.sitemap.sitemap as sm sitemap = Blueprint("sitemap", __name__) -# cache = Cache(current_app, config={'CACHE_TYPE': 'simple'}) +SITEMAP_DIR = tk.config.get('ckanext.sitemap.directory', os.path.join( + os.path.dirname(__file__), 'public')) +SITEMAP_AUTORENEW = tk.asbool(tk.config.get( + 'ckanext.sitemap.autorenew', True)) -SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" +log = logging.getLogger(__name__) -XHTML_NS = "http://www.w3.org/1999/xhtml" -log = logging.getLogger(__file__) +def view(index=None): + try: + return sm.generate_sitemap_response(index) -defualt_locals = tk.config.get("ckan.locale_default", "en") + except Exception as e: + log.exception( + "Error occurred during sitemap response generation: %s", e) + return make_response("Internal Server Error", 500) -if isinstance(defualt_locals, str): - defualt_locals = [defualt_locals] +def redirect_to_sitemap_index(): + return tk.redirect_to("/sitemap_index.xml") -def _create_language_alternatives(link, url): - """ - Create links (elements) for every language in locales_offered in .ini file - :param link: string containing the link, eg. 
/dataset/xyz - :param url: root node - """ - for lang in defualt_locals: - attrib = { - "rel": "alternate", - "hreflang": lang, - "href": tk.config.get("ckan.site_url") + "/" + lang + link, - } - etree.SubElement(url, "{http://www.w3.org/1999/xhtml}link", attrib) +# Don't add url rules if autorenew is set to false and sitemaps stored in public dir +if SITEMAP_AUTORENEW or SITEMAP_DIR != os.path.join(os.path.dirname(__file__), 'public'): + sitemap.add_url_rule("/sitemap_index.xml", + view_func=view, methods=["GET"]) + sitemap.add_url_rule("/sitemap-.xml", + view_func=view, methods=["GET"]) -def sitemap_controller(): - root = etree.Element("urlset", nsmap={None: SITEMAP_NS, "xhtml": XHTML_NS}) - - current_dir = os.path.dirname(__file__) - format_string = "%Y-%m-%dT%H:%M:%S.%f%z" - - def _generate_filename(): - return "sitemap-" + datetime.now(tz=timezone.utc).isoformat() + ".xml" - - def _remove_file(file): - log.info("Removing sitemap.xml file: %s", file) - os.remove(os.path.join(current_dir, file)) - - def _create_file(filename, root): - log.info("Creating new sitemap.xml file: %s", filename) - pkgs = ( - Session.query(Package) - .filter(Package.type == "dataset") - .filter(Package.private != True) - .filter(Package.state == "active") - .all() - ) - - all_ckan_urls = [ - tk.url_for(controller="home", action="index", _external=True), - tk.url_for(controller="dataset", action="search", _external=True), - tk.url_for(controller="organization", action="index", _external=True), - tk.url_for(controller="group", action="index", _external=True), - ] - - for _url in all_ckan_urls: - url = etree.SubElement(root, "url") - loc = etree.SubElement(url, "loc") - loc.text = _url - _create_language_alternatives(_url, url) - - for pkg in pkgs: - url = etree.SubElement(root, "url") - loc = etree.SubElement(url, "loc") - pkg_url = tk.url_for(controller="dataset", action="read", id=pkg.name) - loc.text = tk.config.get("ckan.site_url") + pkg_url - lastmod = etree.SubElement(url, 
"lastmod") - lastmod.text = pkg.metadata_modified.strftime("%Y-%m-%d") - _create_language_alternatives(pkg_url, url) - for res in pkg.resources: - url = etree.SubElement(root, "url") - loc = etree.SubElement(url, "loc") - loc.text = tk.config.get("ckan.site_url") + tk.url_for( - controller="dataset_resource", - action="read", - id=pkg.name, - package_type=tk.h.default_package_type(), - resource_id=res.id, - ) - lastmod = etree.SubElement(url, "lastmod") - _create_language_alternatives( - tk.url_for( - controller="dataset_resource", - action="read", - id=pkg.name, - package_type=tk.h.default_package_type(), - resource_id=res.id, - ), - url, - ) - lastmod.text = res.created.strftime("%Y-%m-%d") - - with open(os.path.join(current_dir, filename), "wb") as f: - f.write(etree.tostring(root, pretty_print=True)) - - def create_response(file): - with open(os.path.join(current_dir, file), "rb") as f: - response = make_response(f.read(), 200) - response.headers["Content-Type"] = "application/xml" - return response - - sitemap_file = [ - file for file in os.listdir(current_dir) if file.startswith("sitemap-") - ] - - if not sitemap_file: - _create_file(_generate_filename(), root) - else: - file_date = sitemap_file[0].replace("sitemap-", "").replace(".xml", "") - now = datetime.now(timezone.utc) - file_date = datetime.strptime(file_date, format_string).astimezone(timezone.utc) - time_difference = now - file_date - - if time_difference.total_seconds() > 8 * 3600: - _remove_file(sitemap_file[0]) - _create_file(_generate_filename(), root) - else: - response = create_response(sitemap_file[0]) - return response - - response = make_response(etree.tostring(root, pretty_print=True), 200) - return response - - -sitemap.add_url_rule("/sitemap.xml", view_func=sitemap_controller, methods=["GET"]) +sitemap.add_url_rule("/sitemap.xml", + view_func=redirect_to_sitemap_index, methods=["GET"]) def get_blueprints():