feat: ⚡️ add Spain IPC dataset

datonic · Apr 1, 2024 · 6e0bd58 · 6e0bd58
1 parent 6a9b4b8
commit 6e0bd58
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 21 deletions.
diff --git a/datadex/assets/huggingface.py b/datadex/assets/huggingface.py
@@ -20,6 +20,7 @@ def hf_asset(data: pd.DataFrame, hf: HuggingFaceResource) -> None:
     "wikidata_asteroids",
     "threatened_animal_species",
     "country_year_indicators",
+    "spain_ipc",
 ]
 
 assets = []

diff --git a/datadex/assets/indicators.py b/datadex/assets/indicators.py
@@ -4,26 +4,7 @@
 import pandas as pd
 import requests
 from dagster import asset
-
-
-def sanitize_string(s: str) -> str:
-    """
-    Sanitize a string to be used as a column name in a pandas DataFrame.
-    """
-
-    return (
-        s.lower()
-        .replace(" ", "_")
-        .replace("(", "")
-        .replace(")", "")
-        .replace("-", "_")
-        .replace(",", "")
-        .replace(":", "")
-        .replace("'", "")
-        .replace("$", "dollar")
-        .replace("%", "percent")
-        .replace("+", "plus")
-    )
+from slugify import slugify
 
 
 @asset
@@ -86,6 +67,6 @@ def world_bank_wdi() -> pd.DataFrame:
     ).reset_index()
 
     # Clean column names
-    pivoted_data.columns = [sanitize_string(col) for col in pivoted_data.columns]
+    pivoted_data.columns = [slugify(col, separator="_") for col in pivoted_data.columns]
 
     return pivoted_data
diff --git a/datadex/assets/others.py b/datadex/assets/others.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import requests
 from dagster import AssetExecutionContext, asset
+from slugify import slugify
 
 from ..resources import IUCNRedListAPI
 
@@ -92,3 +93,27 @@ def spain_energy_demand(context: AssetExecutionContext) -> pd.DataFrame:
         end_date_str = end_date.strftime("%Y-%m-%d")
 
     return df
+
+
+@asset
+def spain_ipc() -> pd.DataFrame:
+    """
+    Spain IPC data from INE. Downloaded from datos.gob.es (https://datos.gob.es/es/apidata).
+    Returns one row per (Periodo, Clases) with a slugified column per "Tipo de dato".
+    """
+    df = pd.read_csv("https://www.ine.es/jaxiT3/files/t/csv_bdsc/50904.csv", sep=";")
+
+    # Clean data. "Total" is assumed to use Spanish formatting ("1.234,5"): strip the
+    # thousands dots before swapping the decimal comma, or such values coerce to NaN.
+    total = df["Total"].str.replace(".", "", regex=False)
+    df["Total"] = pd.to_numeric(total.str.replace(",", "."), errors="coerce")
+    df["Periodo"] = pd.to_datetime(df["Periodo"].str.replace("M", "-"), format="%Y-%m")
+
+    df = df.pivot_table(
+        index=["Periodo", "Clases"],
+        columns=["Tipo de dato"],
+        values="Total",
+        aggfunc="sum",
+    ).reset_index()
+    df.columns = [slugify(col, separator="_") for col in df.columns]
+
+    return df
diff --git a/datadex/resources.py b/datadex/resources.py
@@ -26,3 +26,33 @@ def get_species(self, page):
         r.raise_for_status()
 
         return r.json()["result"]
+
+
+class REDataAPI(ConfigurableResource):
+    """Thin HTTP client for the apidatos.ree.es data API; calls return decoded JSON."""
+
+    endpoint: str = "https://apidatos.ree.es/en/datos"
+    first_day: str = "2014-01-01"
+    request_timeout: float = 60.0  # seconds; without it requests.get can block forever
+
+    def query(
+        self, category: str, widget: str, start_date: str, end_date: str, time_trunc: str
+    ):
+        """GET {endpoint}/{category}/{widget}; both dates get a "T00:00" time suffix."""
+        params = f"start_date={start_date}T00:00&end_date={end_date}T00:00&time_trunc={time_trunc}"
+        url = f"{self.endpoint}/{category}/{widget}?{params}"
+        r = requests.get(url, timeout=self.request_timeout)
+        r.raise_for_status()  # raise on HTTP errors instead of parsing an error body
+        return r.json()
+
+    def get_energy_demand(self, start_date: str, end_date: str, time_trunc="hour"):
+        """Run query() against the "demanda" / "demanda-tiempo-real" widget."""
+        category = "demanda"
+        widget = "demanda-tiempo-real"
+        return self.query(category, widget, start_date, end_date, time_trunc)
+
+    def get_market_prices(self, start_date: str, end_date: str, time_trunc="hour"):
+        """Run query() against the "mercados" / "precios-mercados-tiempo-real" widget."""
+        category = "mercados"
+        widget = "precios-mercados-tiempo-real"
+        return self.query(category, widget, start_date, end_date, time_trunc)