diff --git a/datadex/assets/huggingface.py b/datadex/assets/huggingface.py index 27c83b8..0f226b0 100644 --- a/datadex/assets/huggingface.py +++ b/datadex/assets/huggingface.py @@ -20,6 +20,7 @@ def hf_asset(data: pd.DataFrame, hf: HuggingFaceResource) -> None: "wikidata_asteroids", "threatened_animal_species", "country_year_indicators", + "spain_ipc", ] assets = [] diff --git a/datadex/assets/indicators.py b/datadex/assets/indicators.py index e422b74..509df78 100644 --- a/datadex/assets/indicators.py +++ b/datadex/assets/indicators.py @@ -4,26 +4,7 @@ import pandas as pd import requests from dagster import asset - - -def sanitize_string(s: str) -> str: - """ - Sanitize a string to be used as a column name in a pandas DataFrame. - """ - - return ( - s.lower() - .replace(" ", "_") - .replace("(", "") - .replace(")", "") - .replace("-", "_") - .replace(",", "") - .replace(":", "") - .replace("'", "") - .replace("$", "dollar") - .replace("%", "percent") - .replace("+", "plus") - ) +from slugify import slugify @asset @@ -86,6 +67,6 @@ def world_bank_wdi() -> pd.DataFrame: ).reset_index() # Clean column names - pivoted_data.columns = [sanitize_string(col) for col in pivoted_data.columns] + pivoted_data.columns = [slugify(col, separator="_") for col in pivoted_data.columns] return pivoted_data diff --git a/datadex/assets/others.py b/datadex/assets/others.py index 668a4d9..090e4db 100644 --- a/datadex/assets/others.py +++ b/datadex/assets/others.py @@ -3,6 +3,7 @@ import pandas as pd import requests from dagster import AssetExecutionContext, asset +from slugify import slugify from ..resources import IUCNRedListAPI @@ -92,3 +93,27 @@ def spain_energy_demand(context: AssetExecutionContext) -> pd.DataFrame: end_date_str = end_date.strftime("%Y-%m-%d") return df + + +@asset +def spain_ipc() -> pd.DataFrame: + """ + Spain IPC (Indice de Precios de Consumo, the consumer price index) data from INE. Downloaded as CSV from www.ine.es; see also datos.gob.es (https://datos.gob.es/es/apidata). 
+ """ + + df = pd.read_csv("https://www.ine.es/jaxiT3/files/t/csv_bdsc/50904.csv", sep=";") + + # Clean data + df["Total"] = pd.to_numeric(df["Total"].str.replace(",", "."), errors="coerce") + df["Periodo"] = pd.to_datetime(df["Periodo"].str.replace("M", "-"), format="%Y-%m") + + df = df.pivot_table( + index=["Periodo", "Clases"], + columns=["Tipo de dato"], + values="Total", + aggfunc="sum", + ).reset_index() + + df.columns = [slugify(col, separator="_") for col in df.columns] + + return df diff --git a/datadex/resources.py b/datadex/resources.py index 9e489e8..2d15d0f 100644 --- a/datadex/resources.py +++ b/datadex/resources.py @@ -26,3 +26,33 @@ def get_species(self, page): r.raise_for_status() return r.json()["result"] + + +class REDataAPI(ConfigurableResource): + endpoint: str = "https://apidatos.ree.es/en/datos" + first_day: str = "2014-01-01" + + def query( + self, + category: str, + widget: str, + start_date: str, + end_date: str, + time_trunc: str, + ): + params = f"start_date={start_date}T00:00&end_date={end_date}T00:00&time_trunc={time_trunc}" + url = f"{self.endpoint}/{category}/{widget}?{params}" + r = requests.get(url) + r.raise_for_status() + + return r.json() + + def get_energy_demand(self, start_date: str, end_date: str, time_trunc="hour"): + category = "demanda" + widget = "demanda-tiempo-real" + return self.query(category, widget, start_date, end_date, time_trunc) + + def get_market_prices(self, start_date: str, end_date: str, time_trunc="hour"): + category = "mercados" + widget = "precios-mercados-tiempo-real" + return self.query(category, widget, start_date, end_date, time_trunc)