From 6a5316440510ab729c555d4c1ae43fd89f6b0637 Mon Sep 17 00:00:00 2001 From: Ian Eyre Date: Sun, 19 Nov 2023 16:18:57 +0000 Subject: [PATCH 1/4] Updated Jupyter Notebook --- python-for-data-analysis/README.md | 15 + python-for-data-analysis/Solution-New.ipynb | 782 ++++++++++++++++++ python-for-data-analysis/james_bond_data.csv | 28 + python-for-data-analysis/james_bond_data.json | 1 + .../james_bond_data.parquet | Bin 0 -> 11050 bytes python-for-data-analysis/james_bond_data.xlsx | Bin 0 -> 11269 bytes .../james_bond_data_cleansed.csv | 26 + 7 files changed, 852 insertions(+) create mode 100644 python-for-data-analysis/README.md create mode 100644 python-for-data-analysis/Solution-New.ipynb create mode 100644 python-for-data-analysis/james_bond_data.csv create mode 100644 python-for-data-analysis/james_bond_data.json create mode 100644 python-for-data-analysis/james_bond_data.parquet create mode 100644 python-for-data-analysis/james_bond_data.xlsx create mode 100644 python-for-data-analysis/james_bond_data_cleansed.csv diff --git a/python-for-data-analysis/README.md b/python-for-data-analysis/README.md new file mode 100644 index 0000000000..8cb7b23c7f --- /dev/null +++ b/python-for-data-analysis/README.md @@ -0,0 +1,15 @@ +# Downloadable Files + +This folder contains completed notebooks and other files used in the Real Python tutorial on [Using Python for Data Analysis](https://realpython.com/using-python-for-data-analysis/). + +The `james_bond_data.csv` file contains the original uncleansed data and is the only mandatory file you will need to complete the tutorial. The same data is also available in JSON, Parquet and Excel versions to allow you to complete the optional exercises in reading from these file types. + +A cleansed version of the original data is available in the `james_bond_data_cleansed.csv` file. + +The complete code is available in the `Solution-New.ipynb` Jupyter notebook. 
+ +## Setup + +The easiest way to work through this tutorial is to install and use [JupyterLab](https://realpython.com/using-jupyterlab/). Using Jupyter Notebook within JupyterLab will allow you to run code and see its results cleanly, and in the same way they are presented in the tutorial. It will also make it easy for you to view the supporting files. + + diff --git a/python-for-data-analysis/Solution-New.ipynb b/python-for-data-analysis/Solution-New.ipynb new file mode 100644 index 0000000000..6a361bcbc1 --- /dev/null +++ b/python-for-data-analysis/Solution-New.ipynb @@ -0,0 +1,782 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Acquiring Your Data" + ] + }, + { + "cell_type": "markdown", + "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", + "metadata": {}, + "source": [ + "## Reading Data From CSV Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "! 
python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e52f486-232e-440b-8585-90416e4300c2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()\n", + "james_bond_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e47c1f9b-b390-4035-956b-622615b57f32", + "metadata": {}, + "source": [ + "## Reading Data From Other Sources" + ] + }, + { + "cell_type": "markdown", + "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", + "metadata": {}, + "source": [ + "### Reading JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7465cd11-dad4-4741-9372-f825b28c33d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data_json = pd.read_json(\"james_bond_data.json\").convert_dtypes()\n", + "james_bond_data_json.head()" + ] + }, + { + "cell_type": "markdown", + "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", + "metadata": {}, + "source": [ + "### Reading Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0364b81-64a0-4098-89fc-e58bd6d68257", + "metadata": {}, + "outputs": [], + "source": [ + "! python -m pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", + "metadata": {}, + "outputs": [], + "source": [ + "import openpyxl\n", + "import pandas as pd\n", + "\n", + "james_bond_data_excel = pd.read_excel(\"james_bond_data.xlsx\").convert_dtypes()\n", + "james_bond_data_excel.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b3237b8-99a0-4070-81a9-e7f9e44c8973", + "metadata": {}, + "outputs": [], + "source": [ + "! 
python -m pip install --upgrade pip" + ] + }, + { + "cell_type": "markdown", + "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", + "metadata": {}, + "source": [ + "### Reading Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86284a2-9073-4240-b4d5-5e8b0373fc27", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data_parquet = pd.read_parquet(\n", + " \"james_bond_data.parquet\"\n", + ").convert_dtypes()\n", + "james_bond_data_parquet.head()" + ] + }, + { + "cell_type": "markdown", + "id": "69f884c2-92e8-4db3-bd63-84007f654808", + "metadata": {}, + "source": [ + "### Scraping HTML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902722d-9648-4124-80b0-64004342170d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data_html = pd.read_html(\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + ")\n", + "james_bond_data_html = james_bond_data_html[1].convert_dtypes()\n", + "james_bond_data_html.head()" + ] + }, + { + "cell_type": "markdown", + "id": "31068de2-9864-434a-9652-b115d1131684", + "metadata": {}, + "source": [ + "# Cleansing Your Data With Python" + ] + }, + { + "cell_type": "markdown", + "id": "e432b28e-257b-422b-b2f8-06f41608391b", + "metadata": {}, + "source": [ + "## Dealing With Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38eb1abb-9f89-4a53-9e77-f7c71dbeff18", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id":
"b29d5a34-c930-4ce2-898c-b9e8aa7f771d", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data[james_bond_data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9201a-11c1-4cdd-9625-d70cee736191", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ")\n", + "\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "f6297c81-4c63-4eff-95e3-4a944bb5fe03", + "metadata": {}, + "source": [ + "## Correcting Invalid Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "001996e3-2fce-4228-a873-b78eef613bba", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"US_Gross\", \"World_Gross\", \"Budget ($ 000s)\", \"Film_Length\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a542149d-35d1-4012-8638-25e59f2f3ae4", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"US_Gross\", \"World_Gross\", \"Budget\"]].head()\n", + "data[[\"US_Gross\", \"World_Gross\", \"Budget\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", + "metadata": {}, + "outputs": [], + 
"source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + ")\n", + "\n", + "data[[\"Film_Length\"]].head()\n", + "data[[\"Film_Length\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + ")\n", + "\n", + "data[[\"Release\"]].info()\n", + "data[[\"Release_Year\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", + "metadata": {}, + "source": [ + "## Fixing Inconsistencies in Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"bc483320-7895-4368-a672-b98f8d0c9755", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + ")\n", + "\n", + "data[[\"US_Gross\", \"World_Gross\", \"Budget\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", + "metadata": {}, + "source": [ + "## Removing Duplicate Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Movie\"].value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20067efb-e7c7-4690-b483-1d29847ad24f", + "metadata": {}, + "outputs": [], + "source": [ + "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", + "data[data[\"Movie\"].isin(duplicate_movies)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", 
regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float) * 1000\n", + " ),\n", + " Film_Length=lambda data: (data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year\n", + ").drop_duplicates(ignore_index=True)\n", + "\n", + "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", + "data[data[\"Movie\"].isin(duplicate_movies)]" + ] + }, + { + "cell_type": "markdown", + "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", + "metadata": {}, + "source": [ + "## Correcting Spelling Errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Bond\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float) * 1000\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + "
Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " Bond=lambda data: (\n", + " data[\"Bond\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data[\"Bond\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26b138d-72e5-4e15-a875-ee65023545d1", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data[\"Bond_Car_MFG\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4ae142-e339-4601-b0a4-84375eb28c02", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float) * 1000\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " Bond=lambda data: (\n", + " data[\"Bond\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " Bond_Car_MFG=lambda data: data[\"Bond_Car_MFG\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data[\"Bond_Car_MFG\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", + "metadata": {}, + "source": [
+ "## Checking for Invalid Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"Film_Length\", \"Martinis\"]].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73fe06b-5f42-4357-9b0f-2e460bf0dacf", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float) * 1000\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"]\n", + " .str.rstrip(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " Bond=lambda data: (\n", + " data[\"Bond\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " Bond_Car_MFG=lambda data: data[\"Bond_Car_MFG\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " Martinis=lambda data: data[\"Martinis\"].replace(-6, 6),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data[[\"Film_Length\", \"Martinis\"]].describe()" + ] + }, + { + "cell_type": "markdown", + "id": "52db1351-36ed-4104-a999-345ebbc62214", + "metadata": {}, + "source": [ + "## Storing Your Cleansed Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575a774e-6913-41fb-8ff9-4d786f478007", +
"metadata": {}, + "outputs": [], + "source": [ + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Using Python for Data Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "data = pd.read_csv(\"james_bond_data_cleansed.csv\").convert_dtypes()\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"Avg_User_IMDB\"], data[\"Avg_User_Rtn_Tom\"])\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "x = data.loc[:, [\"Avg_User_IMDB\"]]\n", + "y = data.loc[:, \"Avg_User_Rtn_Tom\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, 
fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "length = data[\"Film_Length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " title=\"Film Length Distribution\", xlabel=\"Time Range (mins)\", ylabel=\"Count\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Film_Length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"Avg_User_IMDB\"], data[\"Kills_Bond\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python-for-data-analysis/james_bond_data.csv 
b/python-for-data-analysis/james_bond_data.csv new file mode 100644 index 0000000000..d8b675dddd --- /dev/null +++ b/python-for-data-analysis/james_bond_data.csv @@ -0,0 +1,28 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget ($ 000s),Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond +"June, 1962",Dr. No,Sean Connery,Sunbeam," $16,067,035.00 "," $59,567,035.00 "," $1,000.00 ",110 mins,7.3,7.7,2,4 +"August, 1963",From Russia with Love,Sean Connery,Bently," $24,800,000.00 "," $78,900,000.00 "," $2,000.00 ",115 mins,7.5,8,0,11 +"May, 1964",Goldfinger,Sean Connery,Aston Martin," $51,100,000.00 "," $124,900,000.00 "," $3,000.00 ",110 mins,7.8,8.4,1,9 +"September, 1965",Thunderball,Sean Connery,Aston Martin," $63,600,000.00 "," $141,200,000.00 "," $9,000.00 ",130 mins,7,6.8,0,20 +"November, 1967",You Only Live Twice,Sean Connery,Toyota," $43,100,000.00 "," $111,600,000.00 "," $9,500.00 ",117 mins,6.9,6.3,1,21 +"July, 1969",On Her Majesty's Secret Service,George Lazenby,Mercury," $22,800,000.00 "," $82,000,000.00 "," $8,000.00 ",142 mins,6.8,6.7,1,5 +"March, 1971",Diamonds Are Forever,Shawn Connery,Ford," $43,800,000.00 "," $116,000,000.00 "," $7,200.00 ",1200 mins,6.7,6.3,0,7 +"August, 1973",Live and Let Die,Roger Moore,AMC," $35,400,000.00 "," $161,800,000.00 "," $7,000.00 ",121 mins,6.8,5.9,0,8 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"April, 1977",The Spy Who Loved Me,Roger Moore,Lotus," $46,800,000.00 "," $185,400,000.00 "," $14,000.00 ",125 mins,,,1,31 +"October, 1979",Moonraker,Roger Moore,Lotus," $70,300,000.00 "," $210,300,000.00 "," $31,000.00 ",126 mins,6.2,5.7,1,12 +"June, 1981",For Your Eyes Only,Roger MOORE,Citroen," $54,800,000.00 "," $195,300,000.00 "," $28,000.00 ",127 mins,6.8,6.3,0,18 +"March, 
1983",Octopussy,Roger Moore,Bajaj," $67,900,000.00 "," $187,500,000.00 "," $27,500.00 ",131 mins,6.5,5.3,0,15 +"October, 1985",A View to a Kill,Roger Moore,Rolls Royce," $50,327,960.00 "," $152,627,960.00 "," $30,000.00 ",131 mins,6.2,4.7,0,5 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"January, 1989",License to Kill,Timothy Dalton,Aston Martin," $34,667,015.00 "," $156,167,015.00 "," $42,000.00 ",133 mins,6.5,6,1,10 +"September, 1995",GoldenEye,Pierce Brosnan,BMW," $106,429,941.00 "," $356,429,941.00 "," $60,000.00 ",130 mins,7.2,6.9,1,47 +"July, 1997",Tomorrow Never Dies,Pierce Brosnan,Aston Martin," $125,304,276.00 "," $339,504,276.00 "," $110,000.00 ",119 mins,6.4,6,1,30 +"June, 1999",The World Is Not Enough,Pierce Brosnan,BMW," $126,930,660.00 "," $361,730,660.00 "," $135,000.00 ",128 mins,6.3,5.7,1,27 +"August, 2002",Die Another Day,Pierce Brosnan,Aston Martin," $160,942,139.00 "," $431,942,139.00 "," $142,000.00 ",133 mins,6,6.1,2,31 +"February, 2006",Casino Royale,Daniel Craig,Astin Martin," $167,365,000.00 "," $596,365,000.00 "," $102,000.00 ",144 mins,7.9,7.8,3,11 +"December, 2008",Quantum of Solace,Daniel Craig,Aston Martin," $169,368,427.00 "," $591,692,078.00 "," $230,000.00 ",106 mins,6.7,6.1,-6,16 +"November, 2012",Skyfall,Daniel Craig,Astin Martin," $304,360,277.00 "," $1,108,561,108.00 "," $200,000.00 ",143 mins,7.8,8.2,1,26 +"September, 2015",Spectre,Daniel Craig,Aston Martin," $200,074,175.00 "," $879,620,923.00 "," $245,000.00 ",148 mins,6.8,6.4,1,30 +"November, 2021",No Time to Die,Daniel Craig,Aston Martin," $160,891,007.00 "," $759,959,662.00 "," $275,000.00 ",163 mins,7.3,7.3,1,14 diff --git a/python-for-data-analysis/james_bond_data.json b/python-for-data-analysis/james_bond_data.json new file mode 100644 index 
0000000000..852810b38e --- /dev/null +++ b/python-for-data-analysis/james_bond_data.json @@ -0,0 +1 @@ +{"Release":{"0":"June, 1962","1":"August, 1963","2":"May, 1964","3":"September, 1965","4":"November, 1967","5":"July, 1969","6":"March, 1971","7":"August, 1973","8":"July, 1974","9":"July, 1974","10":"April, 1977","11":"October, 1979","12":"June, 1981","13":"March, 1983","14":"October, 1985","15":"May, 1987","16":"May, 1987","17":"January, 1989","18":"September, 1995","19":"July, 1997","20":"June, 1999","21":"August, 2002","22":"February, 2006","23":"December, 2008","24":"November, 2012","25":"September, 2015","26":"November, 2021"},"Movie":{"0":"Dr. No","1":"From Russia with Love","2":"Goldfinger","3":"Thunderball","4":"You Only Live Twice","5":"On Her Majesty's Secret Service","6":"Diamonds Are Forever","7":"Live and Let Die","8":"The Man with the Golden Gun","9":"The Man with the Golden Gun","10":"The Spy Who Loved Me","11":"Moonraker","12":"For Your Eyes Only","13":"Octopussy","14":"A View to a Kill","15":"The Living Daylights","16":"The Living Daylights","17":"License to Kill","18":"GoldenEye","19":"Tomorrow Never Dies","20":"The World Is Not Enough","21":"Die Another Day","22":"Casino Royale","23":"Quantum of Solace","24":"Skyfall","25":"Spectre","26":"No Time to Die"},"Bond":{"0":"Sean Connery","1":"Sean Connery","2":"Sean Connery","3":"Sean Connery","4":"Sean Connery","5":"George Lazenby","6":"Shawn Connery","7":"Roger Moore","8":"Roger Moore","9":"Roger Moore","10":"Roger Moore","11":"Roger Moore","12":"Roger MOORE","13":"Roger Moore","14":"Roger Moore","15":"Timothy Dalton","16":"Timothy Dalton","17":"Timothy Dalton","18":"Pierce Brosnan","19":"Pierce Brosnan","20":"Pierce Brosnan","21":"Pierce Brosnan","22":"Daniel Craig","23":"Daniel Craig","24":"Daniel Craig","25":"Daniel Craig","26":"Daniel Craig"},"Bond_Car_MFG":{"0":"Sunbeam","1":"Bently","2":"Aston Martin","3":"Aston 
Martin","4":"Toyota","5":"Mercury","6":"Ford","7":"AMC","8":"AMC","9":"AMC","10":"Lotus","11":"Lotus","12":"Citroen","13":"Bajaj","14":"Rolls Royce","15":"Rolls Royce","16":"Rolls Royce","17":"Aston Martin","18":"BMW","19":"Aston Martin","20":"BMW","21":"Aston Martin","22":"Astin Martin","23":"Aston Martin","24":"Astin Martin","25":"Aston Martin","26":"Aston Martin"},"US_Gross":{"0":" $16,067,035.00 ","1":" $24,800,000.00 ","2":" $51,100,000.00 ","3":" $63,600,000.00 ","4":" $43,100,000.00 ","5":" $22,800,000.00 ","6":" $43,800,000.00 ","7":" $35,400,000.00 ","8":" $21,000,000.00 ","9":" $21,000,000.00 ","10":" $46,800,000.00 ","11":" $70,300,000.00 ","12":" $54,800,000.00 ","13":" $67,900,000.00 ","14":" $50,327,960.00 ","15":" $51,185,000.00 ","16":" $51,185,000.00 ","17":" $34,667,015.00 ","18":" $106,429,941.00 ","19":" $125,304,276.00 ","20":" $126,930,660.00 ","21":" $160,942,139.00 ","22":" $167,365,000.00 ","23":" $169,368,427.00 ","24":" $304,360,277.00 ","25":" $200,074,175.00 ","26":" $160,891,007.00 "},"World_Gross":{"0":" $59,567,035.00 ","1":" $78,900,000.00 ","2":" $124,900,000.00 ","3":" $141,200,000.00 ","4":" $111,600,000.00 ","5":" $82,000,000.00 ","6":" $116,000,000.00 ","7":" $161,800,000.00 ","8":" $97,600,000.00 ","9":" $97,600,000.00 ","10":" $185,400,000.00 ","11":" $210,300,000.00 ","12":" $195,300,000.00 ","13":" $187,500,000.00 ","14":" $152,627,960.00 ","15":" $191,200,000.00 ","16":" $191,200,000.00 ","17":" $156,167,015.00 ","18":" $356,429,941.00 ","19":" $339,504,276.00 ","20":" $361,730,660.00 ","21":" $431,942,139.00 ","22":" $596,365,000.00 ","23":" $591,692,078.00 ","24":" $1,108,561,108.00 ","25":" $879,620,923.00 ","26":" $759,959,662.00 "},"Budget ($ 000s)":{"0":" $1,000.00 ","1":" $2,000.00 ","2":" $3,000.00 ","3":" $9,000.00 ","4":" $9,500.00 ","5":" $8,000.00 ","6":" $7,200.00 ","7":" $7,000.00 ","8":" $7,000.00 ","9":" $7,000.00 ","10":" $14,000.00 ","11":" $31,000.00 ","12":" $28,000.00 ","13":" $27,500.00 ","14":" 
$30,000.00 ","15":" $40,000.00 ","16":" $40,000.00 ","17":" $42,000.00 ","18":" $60,000.00 ","19":" $110,000.00 ","20":" $135,000.00 ","21":" $142,000.00 ","22":" $102,000.00 ","23":" $230,000.00 ","24":" $200,000.00 ","25":" $245,000.00 ","26":" $275,000.00 "},"Film_Length":{"0":"110 mins","1":"115 mins","2":"110 mins","3":"130 mins","4":"117 mins","5":"142 mins","6":"1200 mins","7":"121 mins","8":"125 mins","9":"125 mins","10":"125 mins","11":"126 mins","12":"127 mins","13":"131 mins","14":"131 mins","15":"130 mins","16":"130 mins","17":"133 mins","18":"130 mins","19":"119 mins","20":"128 mins","21":"133 mins","22":"144 mins","23":"106 mins","24":"143 mins","25":"148 mins","26":"163 mins"},"Avg_User_IMDB":{"0":7.3,"1":7.5,"2":7.8,"3":7.0,"4":6.9,"5":6.8,"6":6.7,"7":6.8,"8":6.7,"9":6.7,"10":null,"11":6.2,"12":6.8,"13":6.5,"14":6.2,"15":6.7,"16":6.7,"17":6.5,"18":7.2,"19":6.4,"20":6.3,"21":6.0,"22":7.9,"23":6.7,"24":7.8,"25":6.8,"26":7.3},"Avg_User_Rtn_Tom":{"0":7.7,"1":8.0,"2":8.4,"3":6.8,"4":6.3,"5":6.7,"6":6.3,"7":5.9,"8":5.1,"9":5.1,"10":null,"11":5.7,"12":6.3,"13":5.3,"14":4.7,"15":6.3,"16":6.3,"17":6.0,"18":6.9,"19":6.0,"20":5.7,"21":6.1,"22":7.8,"23":6.1,"24":8.2,"25":6.4,"26":7.3},"Martinis":{"0":2,"1":0,"2":1,"3":0,"4":1,"5":1,"6":0,"7":0,"8":0,"9":0,"10":1,"11":1,"12":0,"13":0,"14":0,"15":2,"16":2,"17":1,"18":1,"19":1,"20":1,"21":2,"22":3,"23":-6,"24":1,"25":1,"26":1},"Kills_Bond":{"0":4,"1":11,"2":9,"3":20,"4":21,"5":5,"6":7,"7":8,"8":1,"9":1,"10":31,"11":12,"12":18,"13":15,"14":5,"15":13,"16":13,"17":10,"18":47,"19":30,"20":27,"21":31,"22":11,"23":16,"24":26,"25":30,"26":14}} \ No newline at end of file diff --git a/python-for-data-analysis/james_bond_data.parquet b/python-for-data-analysis/james_bond_data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..88bd22b4fb36adc606eaf6eacd5b46d56121e5d1 GIT binary patch literal 11050 zcmcgyZEPDydY+Y*HffWwBx@DYk?cK1W)x>7?FYZsa9k}Zks?D%7A29CF0HU6R}`h? 
zE{`9Q=rl!8^om?AIQRnR;5!tDG{}#jz4S+s25s6O*Zw#J2M6en^iZHd5geLJa43qP zD1stro4zxoBub?6f!HQIyEE_1ydTdy^Ugaf-%yeQLo=_QVt(5jXG8`KAoRoDr_UgS zVs%5~SV8uQc4#T}LcLnE^bJckXeHE!#o_UsgX)r2u4$#LRiYTDcUdi))DQOCMV$Op(gs2a;2zSCRo1%rY+=ZrlUmrW!t3f&*^|b zO`=5qx9Na%brR75@5tYxfF(W{Q;oV>vHX}0$icgamc6!h+uKNXykjfTvOM`W$S(4{ zNd31xtYs^87VT1QrRUsWh*iUNa}0CSCFd`us6ia9mx zL*wTTOygJ>>2;nZ3S&(Ce$aSW?}4X}ve?+?l^0~|<-f{Q|Ksn+AA94|EYRI=T80_B z%ZpC+xtynyny#r;jp-Tk4SjHW=w}x9OSIeikJ-<;j3R>kXw3avPMqaCA zRb4+1%~i9`E*N@)jTd(`c6qm$(=Gvd!C>dK3Y$>3v}&#KSe0GUauuxxugVS#8H6D- zMYUua`6{baG&XEjv>mO|2Xsr6SB*RyhtV@d?GZ%P`!rx=*zVYCn;J{qc4`J2sT+fi zOU|WogH3OmmbYf|Y(gVT%ruRPx~)|Ps1G=^!h(JkcDA8aEpJkfmhoySgNY3+NMTb& zZI`W?tja!B)b;+SH^B%nl$}uqxQ{$xRI;I$i4t3|_PlV(HJ zwSFKh)>WfcFR|tZyJYHWPCE_6rR~NBfK94pEmx~(&F#W27fTw}9yoVdhUdF@#@Tn= zX5jxN>+!vFhwIUjFn2&0UX8EYhCZ>nUwO#=Z)DMbMQL5vRJNKuU8s0 z42JEN@1-Wj6Cn$<;%2R0r5<&LinWTV8Ps4rsBWoSO=NYwidSe(qaizMx(N|&>l+Pe8@qS@?8*~Qa2ycbf8;uZ$&O)K52=;)L^wj= z`Wb=iMW_4M42J6n+pg$*cAB|E?+3U4?TzCN&W;H_j`#UFUh+=zJZl>q6Fppj=Q*C| zU4wS7zzNp}k=-Y8K6uj}DLjs7Q4HT8b?A}2oaZ{UM1gvqqFpc29$)wxvio^Xx_%AO zUJp#Lg8(@=j)s9KLe)2tr#^wB$G3-A$;0`ueBm;O`X8Sj<9(b*lsVZWP=D%R#zZeC z@g7d}`z9IW^l`Gpb3R+aab?WM!yu6pBzfe@AaIa;-u`F%udp)I0>H!Xxo>s19|*B!KTr0>dA(hYqx^ zEUiaiUsZ|oeqkH>*y8@xMfXoHVCQ`$ab2DIBP((ZIeusl zfWuvn+n7F4z&>V>Rgk@ySHRDCffwr)IUn)zm?B$D-4?!D{4E3L?0Z?5`M_n z;o+UUhETwF5n275IC&8vfB!|=jx2ky7a$Hu%=c_8Ee3onY-E3b2^ta)4dl7rhnQd5 zc;sVP+JP+3A>n#Iq9t#C(lP1^IrmAf##ii>Z4a)8`^&NN#}>J{nPJ3ulp}14k11ltmg}m1dWfeV0ra%vbEkiJ@i?x z`!?r(^KlIG$)dU<`^#SDJveFI1lbQM&I&RGfoDraqiVAwugi|4Nhjdva?&2rVn;UW z8ZGi8mmvUJT$U&jN20Xo8*z?Aku<{7lH?kr1=(c>bm8Ig00Vwx8F@yoLE)V~iLma1 zkyU8*Ijs}AYo4UX!!Q+fvoLUKYE6bVRU{g9HMhJT+~bJam^^yHVHESMGyU9 zuls%8{qZD5rkjASn0!Dnw*Zr2f=L-k7Rov@jVu&>aoTc@mM%H^?8DPP`7uSIM;uQ3 zBh&BRh3W-|6*(Q_@b2s9k;{P#JLZIO_;+dgxW!KEevcDl4nODgP|tW5Ds&C&_>^ zdjD^Lf`e0BPHm+3HtxLrz8fFb?oYkL%$e2&!J+IF)>o=pWj&gh2@)LsvUlh=J?@`- z+_$9RuOYT8Hm~$Be+Sr%5o~TzG`8J(k)Xp)+xo2RG{6P8SiyX!{Q`l>`tbAy5C+qtk5dr$udg1W)^~tfvSzG9B2R$(tas=qkfF)$cl-cc 
zryT6BPAcjO+zloWXU-7V#_1lo>>)%s&Ry#TP4Mrt&znCm`bg60wC;~^TKC5c!CLO5 z0+Yk^^ZC)s-@8rU8ha6rB=?)whZ(xHJgm(mu-H77f6z1Z8Rh=LlkT^#fq|VoPmt8r zz<=yv{sC6T6=LG+6iq{@Kxb{I;SH-{5DEju0_8LmD-;UKB`9_%15nOEaX>i(WeUng zD0n6HIjq+%5R?$YnLqw@Zl|ec2Xa~gTJpo%cT%CNOBz#zkh^s2d=h%yw?l_`ox#_@ z>iR*_{wh6mr`P?jiu-?l2N3JzeuB(yh`HYPBIl`UZc{6%j1w{T%RL9z`^zXk45|=X z^NQ6@A$N#&!jWvbz2hJYYbUm!)a~@+$4h%(dvM*Sp8JDDHkVfG--lLr%n zhkiOc68Hx(zg_jaF3n z@;S<}qlQ}2fMC>h9a_~Id|!bkPwHkFs&FmDbPd;qXMYzOO@`})yVl}+l+6ZwfXc5Q zr8V_huQke8-e}z@=FEIkg?g#nIOs65Tkts$t5(u#YF@3WF#MTO&4A(X6%)^0Sq@JG zMxSj{=BO58a*yi?dd_1??c|A^IH zW_$0wmA(J{rBiNu&%Kq|_6eXg8O zWp{KRlfI#wYR%`t6OAL$q^+YZoOEp81OBb6(-&yaHsT&Dv45@U6yDx*OlZ;QYWD&8 zi>yAp7j%j|+V>5Gl2fCisWr1gg1KJIR@6!Z$MSMR#eZ+Y3cw$nszuYlRKYXJPYRcm5*~o4+8j6+FbgH@*mg}o&LC52=Myi?(0Usj>dVy>~=Cw?a^hJ$ST}{vLtVLFe zST>BU<>vH_{G7f^_=aLTtEIBODlInq(6`*Gf9HlN$#%gO;>Qqr3P48fBa%w9dBK@_sOmY)=@T&)Y+>_%WjP0S!#gbfKYmV>mZ*)7hlPxvZ#=npKoNP`I{%{mrBiVB&FC+PD(sbf2Ksaf!BJw z>@Mp+la*q4FVE#l^37a>td~|yns48uLbjxDWyR!v%n`DgptpTZX2sZd*3$EQ69+-h zN5T--lA8zdVUDjx=i1^)C>F}6lg3(RvG7!Av5=Tic5{twf>9SePaON8)?>1ySF>ba3del z>uToYeGX^Xq1?`Mu$(o%9*#>QtjW3gjW3SfwQ@GHcQeC(aSZ>ay0!9%##%beXQgQC z?A|M{LavfCw#m5#=RVY9JNagu5n=t^RMT>uoaDkKe$L znf?4tQT#WgsVUrCj1sOmkvmNFU=~i@W{=>gD2bamX~YAAWKwA6iDr8?1kzv`J$`7oAq3wHqDIf7}q#|++_b3T} zOTVb1-+(%?FF};TiEg`nGnhKtw$FHdJ~v&VJ0!PGc^T`U@%EYIs7E5 zkfsvHuVb-N*~Rjww)X7_H|-%4+VH_x2D`<1C4lEQwp#YgHs=Qq{YN+{RXqRbdA6(M zVA53T20V!UE5!Z||6&H=!?T?5o5k{;Q1E=rxMeRvqRn2i{*LD$Wd0nPzk6&xPWs#N zBKDK{WKyVr$BW1GjgUT~I;oKbGTY1{F`S}ApjatE zKA#TXTwBB&WwE8-lTRWiTUIOIscW@~T)8|!E?jKuLd7<(TCThBx0)~ghyK%r&<*%s Gvi}c^zcEVy literal 0 HcmV?d00001 diff --git a/python-for-data-analysis/james_bond_data.xlsx b/python-for-data-analysis/james_bond_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1e042705fc38619b41b62bd98b1c1a807c061cc0 GIT binary patch literal 11269 zcmeIY1zQ|j);8QofW}>eLvRi5!QI`1ySqzp2m}opg1fuBy9Em#92$3d`<$8i&P>jH zzu=kb?%G{_?R#BaweO{SDat@WV*+3RZ~y>+6kv3kX`v4R0K`E902lx`NF5P-I~P+s 
z7XwvK2UBM~Mh{yXk{oDA>TCccc>e#6|HDt9EOAu6ml;*;R`N+?he3L^MhK4mBxn$w zR!N|%FJYj>SSQQU@--{s5mop*mKA3;`sAuN>-o4vjh$^nQ20QrGGbKVP?wrM0Vh-c z;4x(<4qm*IwiYNGlR$_C&%hwYEFJLGv9VW~O?2~vbQ!({7PdgZcINV+I@Ucbe%rBK3<*FpQEl6A6huDEDRWQ zvV;OjgJu>1iq=te*yXHtlfHd|Sh=#JXFVi8A1OC6-uA5%n6zZY#+k_DlG)O4=H&D?joh|4LjKwI^gv_p8BN| ze0rXI;LrL30|30fLID*2LoMson8~leT$2TB9RgS_4V+AEoS7K^IR8%_{}1cnzukIS zyqrQWGh!%6@+oZScK&A^s<4dv2gz1aRlfkKRh0UeTnd7pom6j8Rq=zNB>da`o<>%F z^2Hnuk=|^vl|`Xr@{%{Wmxrc2IJ&~pzH>?zcP!iNLvx$Ip1(9$MANnSGcrneP81x!?dv zFq~WVcqSEV$l1tzslsQ-hV=R#Uroh=+p@|i%ZZ1~)4;^0>rynm6XV&3K|W(hnT!qn zj&)LEkTUz)U%QU&d^FRuj}zEiI&?N16h&OL2KG>YSCU{9iLi9Ag1~`Q2nGCPJZzZU z?VYTR?Cq`p@LPGRathnbsO^8Z?4wkPh%3cif9|-DXJJhq`D7-2 zfotE~y%!8xpIDF~Ty! zsiS`KlMm@*7tRw5`kpM)eWzK?x;L{xl4d>!>6;MQ!)C$#2&XH5ZLH-X`|FJmz398+ z2FQMD^Jx>xq=G2Kp0zl}Ws4>e4@rPXDK}78LgH7DrRm&E7NE``K_K!Qv;ZkA&Eng= z4P!|{gG;JSt0wdOUWFR?Tg^>6Twq7ExR7>|I+^4e)pU!u9RnYiPGt1@j~TH#YQO1K zHQHY4+yVV}Jr@)M{LJ%d#BEciEVN=UYwC;ku2q7Iqn;(*0m~@wN9*&F4X7mwM=$XMc zNLr{p;b$yQplZU6E-T@e`%BBk(*!rpE3d-dtRDyN;`*IpE&8BCr@52sux%MuuRuc= zToy5z^wY@WY~$6yNTTM7gM#xo5?Zkf>22;sy{z{Do8#rB_6_SxT3GX(@!+@! zZ4o#aBFWV0R4#mLQ@PGR#Qo7AW!1yOFWHWc7C5w7m~0c31WYmc;{a^sjX_DO&8(E5t`1G z3;NM3klD`g;QTsxrT^U$AYM*Nex5=F|I6H$*IRiO9w;;8Zz`#5Lq?HouJ$btF0KVS!MVu&rY1R;b(a@ z6Q~8L-1qqIv?wSHg5p^2@`jah5cv~vVX z$`JMm%#J7%x1#%*m#;B+I1)4Yucge6-?SEo90I6RFFND5y1yi47`FTOgi2^GGNxCU zW=(_kL5CZ>$Z{@L%gw|*nLFqTTuo%ykjR`@DbYM|Zg>dfy-a7QUn6sdPyoP&b+A z8peW|f@1s@Wdm9q1FLlyP~7?F0{(eQOk1%#;y@MiryTMoRB7*<%g{f+Q`=1tgX)gb{gF+h8r3-$CPBr^jx7l zTRPN;DWE9X6(s3iRZE{QdoNX1g&UtGqT4&4D)rQ2Gd)aCkZ;rzI&c9_wBh&-nJF@4 z>N!mr_dMK=DtVkrjF704=HQTXN%L}Z0}L-Yaalr1{6%_Bmp4df!Dn{)Lp{y#r!*(}d^(ud zC$fa1=#DA&La+Rcqk41rY}<47;KDsqH+JJw3B}1>oqG|p=;_Hht?%n&Pat?}Q_u@> z4jg+<-drTvTsRp&fzl^@r95-;$U63%rVMI`r%vkb$#C>BZlw|0wM(=?3V7ik)M7Ko zA==)3NYfWLkM|59??oNA2eOm*1F)t5fVhc?IOK|szd{OPl90kVT>J!r|7XD!27 z0)C9Y*pM#dCT5N=$DT+4aCryu6&AHiJjQz5+N=9f~>r7A`t)7Lpu=$57u4o`bx1cC`CCI!YXR*TgkCz)JLz)j3BsX@1K5 zb|*C6TiejiDli7$a?i{pny!rXg62czg>g*6bW&Yzu0! 
zjJ(;~dj0ft%{m&)tEbxZvYMW@_=^SIUl8dRBHB9a$~N;;668xdeE7}F5XM_afnUVI zD^QfZqJkWahRbgBQGj1?Ojlu+$>8N{+5tjdkih zrgHlf_cCpv|+g2ewuU$ZSgw!Ffv7o7JpHOrOKoMDOwXJ!4{>{o#_Xf@e0W6 z(fWipDM|Mg6;F4?=e51mLxhro67I&_392zjwAjbkR4lYA`j=@|0^%A{vt)IU1X`si zBr(L##PTwA+L$EFS$ZKn`M~3o&A3qLYxZG7XX&=l4Hgw;rcbhOEoSn7-bSCD+naB`HYP+hr*8l#S+fQDiw zTiULDIdT)-0l{1!e&p7gY3C|F32?v-Do{^m9oM#RU(TAL4?O-Mwwx*5uIzkH`CyH} z%DHml)~+>V6oDW2tgX{ksexZ#t_Eu_gl%BZ^dUATztHrK(WVy zLtDr??8fWOHJ-`P(r26wC;9>E(k1g)4OA(Z0?!-`Y7^+71khj1$M{20f#_&;5zHzuz;vbY3XQO?E@0*sZ$4~X?CNr{#vI8fKnaf|{bEbD{7VZZlxM5S=3R2!`db|>L2pjk;c?p=dAIK?MmVcJB zylF8SqgLH^@AvTHvF)DTk%`E}p!C2yHPG~EJm*>=M?H#yoosD+tT=^vy>DryPnp!V z-Ft(bw&Q7F)3W1YnZTD9O;`jZ=gdDypnQw1gn}&xWna*=G?hqoR!@D3%!s{s{2{0` zwn!QoJwqf`kDc6r9$}_hlskoaCDru3PdrB5IG5D-agKzKdXtC^M3QB0o+c?jMrt>s zqf;@B-}}qsm0LoZn>M)pI{B(rt|~LWTcXaMld8N|#G!)c(^P~{KT!tMg9}=FQ??p_ zrZ@CFy%jG{-nm35%`)BjVsk*Pf=f6LlkxSYfKo=DS#P)~{4_sJJMZ>&xZ=<~k4=rgv3iBjJ?~EZ=uZ)_ z(>TGd!}$s>sr)^A4oL5TBZ2||u3!NGq<^?2XBST!Q|CX+hH5KzE6j+##8v*NSEnOx z+w?#i961%JH!UB9cnvEf8K{I^SxoYsd5+g5?&F&*8WERd$du-$Nl~N*HDWOLQ&ZEd zlQuSb2E&VWadd6bpH5gV&2Z9bHG^);1~d)^Q5);F5cF8yP~~&PYcHlD71Q-vD7DHc$4%W9iYiw)gjL1ecLqpWWw|A#SZgVb{?K)3 zwXbH5zu&UUt@~~bp_*^ujpf{`F)Hbt-l-1G;O%#Vqq~pA@5wv?UPx2gVfN$e9LKX0|M*ikzlOsos@}|N*L-eQJQu3gqKf|54 z)IxI}3!8MKn2zlog03rJaxZr<1hnzO#j2K2V#7^Z%_N|YPx`pDjtKpe?mER?6hV=Z zsB`;vkJGihG(@e%XR`ZKxuZ~Hy7f)0sS-oVpTI{CF4;53=Ts~osy*1s$|tQAY_jgb z>-M84!^1VhnD)x=UpI+Q5EafQSAG@VA95hHr*>v|CdgoWURpF?or>;tuwR=8$K6`2ro~-+Z=|kKHpwp9XVdsCX zz+-mRYITDH%2a#+0P~-UWbx6-)I`O_$cqGb#^-KyU98!IH`qLZ4I0$;7MY!rq z+*Qs%JX7KllBNF@8Ff9pVdr=6%ptU_Se+Asd2CW;>~&`M^z1&UO#tm4B7kQTz0^U7 zeC^aLT@t&K4D`65R+B<{Dq0X}SBtK=3T08j?xjW?uFBIzoKjZr$w1736DUA`lIfqg zrQ)ItKCj;0Hh$k>`X)Dx5M}(%LzDO_xppK)QBR_Dv!IAOe6iDBI=jwh%*Jsaop32`@;_UQ!h8m(W08^ z%N>eYbsdi6{C|mk$I)~2FM~!vosgP)(ZM-{(;K}xh*k=m$VZhuDKk&HaWA8Zc%y)|&TSsu-Xm!g(K+2rLzp zI%+Q%v8{kR*rW}>*GUY7=Grnx&W=imn%9A}2IFTH^?h+nUbamI#PM(UGd;Jur+MG` z-+$=~OT^iRx_~|2IJ~OE*dQ)7ial$~3?T^g`v@QK6k%EW_#qchhc^2P?@U8NyaLw( 
zR|MmM69WbNy?m;+I`&5%ASq7(YE3~i|Ca>rFA1W^!iuAEKgnnsu7YOKMOAZE`ls?$ z5#I*qqY^wEC!#dXCYx%60+e48Bdq(+sg_$vdHbOzI=>SY=M%}v1gZ#Q1k8lk%GpsW z(xypEeiJL3z05+wi4E%e<^?Ap8kzx!JS#;?(T-(cophEwxuPBbgUUyEJrqsoL|PLq1NS_{pUr*53w;KylQsrQ;7tSzE_gVM3tbjlG6NQ@7Puw&_n3Emfq24=~jIfc}V5h%0=_tesXj}kWPg6V$`<$ z?&CUKqRz(JD2syu!F;EqLnC5lcckc)TgagK>|>VLASp1O8s!37UACPQlEq~3728qc z*_3R^wr*6Jmh#)QtF5i|?Se*6w;13()rUm%Z$n#N8RzY9c#&=szje}jHiuj+kFx17 zUVZ3g9m>8HRzJF#7+H!#13G16ZT0K3;7-pyV#CZHEnkQ(l?o90w6#BNA!fppRJ2Uh zf!@JFl{s1Pz1<^nmG8>CTkkoA=`=yHn$*3!zb;^N7lt{ zj&p8bm)5A%H1VEv{MP@#C6AMWw^-To3PBbwpPrduaa}ZLcQXBFyU_pksKCB3I$qB@m>Dw^1p9~*?;WG0 zzL1T5ql8<=tT{MvD(qxs`Yx0r{d%3r>WF@&pK<(g_nbS-q%^oJn2EW;O7*iOW-b8J zUNnE?ZM)5nO?LP)W9h0ErMO&VdTp!}9K0WioG@9kx!q0hHXBkS$`#Ia?(!ml(Ppl zSgTSv#)p3TM6os?lg4P_b-nibaO}s#MKqM#^%^0QMsMDSk&W9nNi22_j?4cZxC3Ds zNTR^x!v<3m_20?&(ZS(=+yj&DpGQXgI5>wv{RDdRfD8#@`Beu8A8e#Z!Qf<}1f?E- znXIiPO0B$M2i>Y*pe$_Dd{W%_jE5+QTx^HZPA4915?tBmr4|X<4ve{el;&G7s1%3@ z!u5e-^eF2?lUhBz1EV&&hKEgtsmM@4+9GJ zJejU-V;m)jfE8aibY{;$>_2Y8yZ4t%H)JK|pC2TJuvUYG(O4mJE#7HX`WDu$1(jh} z*{Op86_Omq1RE?r{7aegkTC6s#r;?Q#B) zFwMBUgk%ZvQp_lYPQ`i)H@W%>L8`j& zwcUvE*eT)lreRmPKZSSvQ7cpWFnY&$w~txZeRkvRTby_fP(bzzom1P2Vcv#5qo+t!P-re1yhvcs2Q3x5U75ReSu z;P}5cT>fjp{x$xW_De;X{}k|_tzrKH{xPP2wc_6z#eN6=-Y)YOv<+OM{jGWCckqAq zDEtKl06qc#1pohZEc`C#_l@hnBsC)Ze}Cd{+t|NL`Ms|EmlS<)8UntR->b~O3;4YP z@RxvVa4G{9@K-(Hcj)gq&tFghqJKbt&wzfH@SiU9FFXLiLID8$&7uAd|4$?M@9>Iu c{|5i3NmP`1172MK03Q4b0Q+ovnm?}o9|Ua)9RL6T literal 0 HcmV?d00001 diff --git a/python-for-data-analysis/james_bond_data_cleansed.csv b/python-for-data-analysis/james_bond_data_cleansed.csv new file mode 100644 index 0000000000..f67cf98f34 --- /dev/null +++ b/python-for-data-analysis/james_bond_data_cleansed.csv @@ -0,0 +1,26 @@ +Avg_User_IMDB,Avg_User_Rtn_Tom,Bond,Bond_Car_MFG,Budget ($ 000s),Film_Length,Kills_Bond,Martinis,Movie,Release,US_Gross,World_Gross,Budget,Release_Year +7.3,7.7,Sean Connery,Sunbeam," $1,000.00 ",110,4,2,Dr. 
No,1962-06-01,16067035.0,59567035.0,1000.0,1962 +7.5,8.0,Sean Connery,Bently," $2,000.00 ",115,11,0,From Russia with Love,1963-08-01,24800000.0,78900000.0,2000.0,1963 +7.8,8.4,Sean Connery,Aston Martin," $3,000.00 ",110,9,1,Goldfinger,1964-05-01,51100000.0,124900000.0,3000.0,1964 +7.0,6.8,Sean Connery,Aston Martin," $9,000.00 ",130,20,0,Thunderball,1965-09-01,63600000.0,141200000.0,9000.0,1965 +6.9,6.3,Sean Connery,Toyota," $9,500.00 ",117,21,1,You Only Live Twice,1967-11-01,43100000.0,111600000.0,9500.0,1967 +6.8,6.7,George Lazenby,Mercury," $8,000.00 ",142,5,1,On Her Majesty's Secret Service,1969-07-01,22800000.0,82000000.0,8000.0,1969 +6.7,6.3,Sean Connery,Ford," $7,200.00 ",120,7,0,Diamonds Are Forever,1971-03-01,43800000.0,116000000.0,7200.0,1971 +6.8,5.9,Roger Moore,AMC," $7,000.00 ",121,8,0,Live and Let Die,1973-08-01,35400000.0,161800000.0,7000.0,1973 +6.7,5.1,Roger Moore,AMC," $7,000.00 ",125,1,0,The Man with the Golden Gun,1974-07-01,21000000.0,97600000.0,7000.0,1974 +7.1,6.8,Roger Moore,Lotus," $14,000.00 ",125,31,1,The Spy Who Loved Me,1977-04-01,46800000.0,185400000.0,14000.0,1977 +6.2,5.7,Roger Moore,Lotus," $31,000.00 ",126,12,1,Moonraker,1979-10-01,70300000.0,210300000.0,31000.0,1979 +6.8,6.3,Roger Moore,Citroen," $28,000.00 ",127,18,0,For Your Eyes Only,1981-06-01,54800000.0,195300000.0,28000.0,1981 +6.5,5.3,Roger Moore,Bajaj," $27,500.00 ",131,15,0,Octopussy,1983-03-01,67900000.0,187500000.0,27500.0,1983 +6.2,4.7,Roger Moore,Rolls Royce," $30,000.00 ",131,5,0,A View to a Kill,1985-10-01,50327960.0,152627960.0,30000.0,1985 +6.7,6.3,Timothy Dalton,Rolls Royce," $40,000.00 ",130,13,2,The Living Daylights,1987-05-01,51185000.0,191200000.0,40000.0,1987 +6.5,6.0,Timothy Dalton,Aston Martin," $42,000.00 ",133,10,1,License to Kill,1989-01-01,34667015.0,156167015.0,42000.0,1989 +7.2,6.9,Pierce Brosnan,BMW," $60,000.00 ",130,47,1,GoldenEye,1995-09-01,106429941.0,356429941.0,60000.0,1995 +6.4,6.0,Pierce Brosnan,Aston Martin," $110,000.00 ",119,30,1,Tomorrow 
Never Dies,1997-07-01,125304276.0,339504276.0,110000.0,1997 +6.3,5.7,Pierce Brosnan,BMW," $135,000.00 ",128,27,1,The World Is Not Enough,1999-06-01,126930660.0,361730660.0,135000.0,1999 +6.0,6.1,Pierce Brosnan,Aston Martin," $142,000.00 ",133,31,2,Die Another Day,2002-08-01,160942139.0,431942139.0,142000.0,2002 +7.9,7.8,Daniel Craig,Aston Martin," $102,000.00 ",144,11,3,Casino Royale,2006-02-01,167365000.0,596365000.0,102000.0,2006 +6.7,6.1,Daniel Craig,Aston Martin," $230,000.00 ",106,16,6,Quantum of Solace,2008-12-01,169368427.0,591692078.0,230000.0,2008 +7.8,8.2,Daniel Craig,Aston Martin," $200,000.00 ",143,26,1,Skyfall,2012-11-01,304360277.0,1108561108.0,200000.0,2012 +6.8,6.4,Daniel Craig,Aston Martin," $245,000.00 ",148,30,1,Spectre,2015-09-01,200074175.0,879620923.0,245000.0,2015 +7.3,7.3,Daniel Craig,Aston Martin," $275,000.00 ",163,14,1,No Time to Die,2021-11-01,160891007.0,759959662.0,275000.0,2021 From b0886f293bea45c9359752d60b0770a55d85cd33 Mon Sep 17 00:00:00 2001 From: gahjelle Date: Mon, 20 Nov 2023 19:44:05 +0100 Subject: [PATCH 2/4] Rename notebook --- .../{Solution-New.ipynb => james_bond_analysis.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python-for-data-analysis/{Solution-New.ipynb => james_bond_analysis.ipynb} (100%) diff --git a/python-for-data-analysis/Solution-New.ipynb b/python-for-data-analysis/james_bond_analysis.ipynb similarity index 100% rename from python-for-data-analysis/Solution-New.ipynb rename to python-for-data-analysis/james_bond_analysis.ipynb From 65af430d8fa4b9c861f220b4d480481d1f4daf10 Mon Sep 17 00:00:00 2001 From: gahjelle Date: Mon, 20 Nov 2023 19:45:21 +0100 Subject: [PATCH 3/4] Reformat notebook --- python-for-data-analysis/README.md | 2 +- .../james_bond_analysis.ipynb | 46 ++++++++++++------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/python-for-data-analysis/README.md b/python-for-data-analysis/README.md index 8cb7b23c7f..9281b87392 100644 --- 
a/python-for-data-analysis/README.md +++ b/python-for-data-analysis/README.md @@ -6,7 +6,7 @@ The `james_bond_data.csv` file contains the original uncleansed data and is the A cleansed version of the original data is available in the `james_bond_data_cleansed.csv` file. -The complete code is available in the `Solution-New.ipynb` Jupyter notebook. +The complete code is available in the `james_bond_analysis.ipynb` Jupyter notebook. ## Setup diff --git a/python-for-data-analysis/james_bond_analysis.ipynb b/python-for-data-analysis/james_bond_analysis.ipynb index 6a361bcbc1..4a8ce4b46e 100644 --- a/python-for-data-analysis/james_bond_analysis.ipynb +++ b/python-for-data-analysis/james_bond_analysis.ipynb @@ -406,22 +406,32 @@ "metadata": {}, "outputs": [], "source": [ - "data = james_bond_data.combine_first(\n", - " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", - ").assign(\n", - " US_Gross=lambda data: (\n", - " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", - " ),\n", - " World_Gross=lambda data: (\n", - " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", - " ),\n", - " Budget=lambda data: (\n", - " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", - " ),\n", - " Film_Length=lambda data: (data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)),\n", - " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", - " Release_Year=lambda data: data[\"Release\"].dt.year\n", - ").drop_duplicates(ignore_index=True)\n", + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " 
),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", "\n", "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", "data[data[\"Movie\"].isin(duplicate_movies)]" @@ -721,7 +731,9 @@ "source": [ "length = data[\"Film_Length\"].value_counts(bins=7).sort_index()\n", "length.plot.bar(\n", - " title=\"Film Length Distribution\", xlabel=\"Time Range (mins)\", ylabel=\"Count\"\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", ")" ] }, From 150da332f4a25e49dc88b0c5fb47dc6baab1664d Mon Sep 17 00:00:00 2001 From: eyrei123 <88923476+eyrei123@users.noreply.github.com> Date: Sun, 3 Dec 2023 13:45:56 +0000 Subject: [PATCH 4/4] File versions after TR2 --- README.md | 54 +- data analysis results.ipynb | 222 +++++++ data_analysis_findings.ipynb | 1093 ++++++++++++++++++++++++++++++++++ james_bond_data.csv | 28 + james_bond_data.json | 1 + james_bond_data.parquet | Bin 0 -> 11050 bytes james_bond_data.xlsx | Bin 0 -> 11269 bytes james_bond_data_cleansed.csv | 26 + 8 files changed, 1387 insertions(+), 37 deletions(-) create mode 100644 data analysis results.ipynb create mode 100644 data_analysis_findings.ipynb create mode 100644 james_bond_data.csv create mode 100644 james_bond_data.json create mode 100644 james_bond_data.parquet create mode 100644 james_bond_data.xlsx create mode 100644 james_bond_data_cleansed.csv diff --git a/README.md b/README.md index a4f169fe5c..b714e41539 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,17 @@ -# Real Python Materials - -Bonus materials, exercises, 
and example projects for Real Python's [Python tutorials](https://realpython.com). - -Build Status: -[![GitHub Actions](https://img.shields.io/github/actions/workflow/status/realpython/materials/linters.yml?branch=master)](https://github.com/realpython/materials/actions) - -## Got a Question? - -The best way to get support for Real Python courses, articles, and code in this repository is to join one of our [weekly Office Hours calls](https://realpython.com/office-hours/) or to ask your question in the [RP Community Chat](https://realpython.com/community/). - -Due to time constraints, we cannot provide 1:1 support via GitHub. See you on Slack or on the next Office Hours call 🙂 - -## Adding Source Code & Sample Projects to This Repo (RP Contributors) - -### Running Code Style Checks - -We use [flake8](http://flake8.pycqa.org/en/latest/) and [black](https://black.readthedocs.io/) to ensure a consistent code style for all of our sample code in this repository. - -Run the following commands to validate your code against the linters: - -```sh -$ flake8 -$ black --check . -``` - -### Running Python Code Formatter - -We're using a tool called [black](https://black.readthedocs.io/) on this repo to ensure consistent formatting. On CI it runs in "check" mode to ensure any new files added to the repo follow PEP 8. If you see linter warnings that say something like "would reformat some_file.py" it means that black disagrees with your formatting. - -**The easiest way to resolve these errors is to run Black locally on the code and then commit those changes, as explained below.** - -To automatically re-format your code to be consistent with our code style guidelines, run [black](https://black.readthedocs.io/) in the repository root folder: - -```sh -$ black . -``` +# Using Python for Data Analysis + +This folder contains completed notebooks and other files used in the Real Python tutorial on [Using Python for Data Analysis](https://realpython.com/using-python-for-data-analysis/). 
+ +None of the files are mandatory to complete the tutorial. However, you may find them useful for reference as you work through it. + +## Available Files + +- `data_analysis_findings.ipynb` is a Jupyter Notebook containing all the code used in the tutorial. +- `data analysis results.ipynb` is a Jupyter Notebook containing the final version of the cleansing and analysis code. +- `james_bond_data.csv` contains the data to be cleansed and analyzed in its original form, in CSV format. +- `james_bond_data.json` contains the data to be cleansed and analyzed in its original form, in JSON format. +- `james_bond_data.parquet` contains the data to be cleansed and analyzed in its original form, in Parquet format. +- `james_bond_data.xlsx` contains the data to be cleansed and analyzed in its original form, in Microsoft Excel format. +- `james_bond_data_cleansed.csv` contains the cleansed data in its final form. + +Although the tutorial can be completed in a range of Python environments, the use of Jupyter Notebook within JupyterLab is highly recommended. 
\ No newline at end of file diff --git a/data analysis results.ipynb b/data analysis results.ipynb new file mode 100644 index 0000000000..fed90709d0 --- /dev/null +++ b/data analysis results.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Data Cleansing Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()\n", + "\n", + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " 
.str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ").drop_duplicates(ignore_index=True)\n", + "\n", + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Data Analysis Code" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "x = data.loc[:, [\"imdb_rating\"]]\n", + "y = data.loc[:, \"rotten_tomatoes_rating\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": 
"b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots()\n", + "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"film_length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb_rating\"], data[\"bond_kills\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")\n", + "fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data_analysis_findings.ipynb b/data_analysis_findings.ipynb new file mode 100644 index 0000000000..e1a3e0f216 --- /dev/null +++ b/data_analysis_findings.ipynb @@ -0,0 +1,1093 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Acquiring Your Data" + ] + }, + { + "cell_type": "markdown", + "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", + "metadata": {}, + "source": [ + "## Reading Data From CSV Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e52f486-232e-440b-8585-90416e4300c2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "797f69eb-3108-45d3-9a67-58c43593abf1", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e47c1f9b-b390-4035-956b-622615b57f32", + "metadata": {}, + "source": [ + "## Reading Data From Other Sources" + ] + }, + { + "cell_type": "markdown", + "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", + "metadata": {}, + "source": [ + "### Reading JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7465cd11-dad4-4741-9372-f825b28c33d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_json(\"james_bond_data.json\").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", + "metadata": {}, + "source": [ + "### Reading Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0364b81-64a0-4098-89fc-e58bd6d68257", + "metadata": {}, + "outputs": [], + "source": [ + "! 
python -m pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", + "metadata": {}, + "outputs": [], + "source": [ + "import openpyxl\n", + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_excel(\"james_bond_data.xlsx\").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", + "metadata": {}, + "source": [ + "### Reading Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86284a2-9073-4240-b4d5-5e8b0373fc27", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_parquet(\n", + " \"james_bond_data.parquet\"\n", + ").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "69f884c2-92e8-4db3-bd63-84007f654808", + "metadata": {}, + "source": [ + "### Scraping HTML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902722d-9648-4124-80b0-64004342170d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data_html = pd.read_html(\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + ")\n", + "james_bond_data = james_bond_data_html[1].convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "31068de2-9864-434a-9652-b115d1131684", + "metadata": {}, + "source": [ + "# Cleansing Your Data With Python" + ] + }, + { + "cell_type": "markdown", + "id": "e0dcca3b-6e71-481d-a071-6218012db962", + "metadata": {}, + "source": [ + "## Creating Meaningful 
Column Names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d70997b9-3c75-4165-b034-8544bd084c04", + "metadata": {}, + "outputs": [], + "source": [ + "new_column_names = {\n", + " \"Release\": \"release_date\",\n", + " \"Movie\": \"movie_title\",\n", + " \"Bond\": \"bond_actor\",\n", + " \"Bond_Car_MFG\": \"car_manufacturer\",\n", + " \"US_Gross\": \"gross_income_usa\",\n", + " \"World_Gross\": \"gross_income_world\",\n", + " \"Budget ($ 000s)\": \"movie_budget\",\n", + " \"Film_Length\": \"film_length\",\n", + " \"Avg_User_IMDB\": \"imdb_rating\",\n", + " \"Avg_User_Rtn_Tom\": \"rotten_tomatoes_rating\",\n", + " \"Martinis\": \"martinis_consumed\",\n", + " \"Kills_Bond\": \"bond_kills\",\n", + "}\n", + "\n", + "data = james_bond_data.rename(columns=new_column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "937b9121-b7ae-4f7e-800d-bfcc2689c98a", + "metadata": {}, + "outputs": [], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "markdown", + "id": "e432b28e-257b-422b-b2f8-06f41608391b", + "metadata": {}, + "source": [ + "## Dealing With Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d497e64c-aa7e-4d09-8de1-f529939d58f9", + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9201a-11c1-4cdd-9625-d70cee736191", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.rename(columns=new_column_names).combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49d06d77-49b0-4e89-b228-8583650595af", + "metadata": {}, 
+ "outputs": [], + "source": [ + "pd.DataFrame({\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4af51fb-fd1f-4570-b16f-6f20e0b65473", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "markdown", + "id": "f6297c81-4c63-4eff-95e3-4a944bb5fe03", + "metadata": {}, + "source": [ + "## Correcting Invalid Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "916c91b8-7888-40fc-bce7-247837508adf", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "001996e3-2fce-4228-a873-b78eef613bba", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " 
gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8331b98e-169f-4d3b-9b88-0ece7ddc8dea", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6340b1f-3b1c-42e6-9b69-e981f645d77b", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7025fbd2-ce44-4efe-88c9-9f51830776c2", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].head()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "d45b9b42-7c71-422f-9ddb-ea659e5385c9", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f5dacf7-2f6c-47f4-b875-7d36f2251627", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f147876-7348-43e9-ac6a-3f3df6ee2af9", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\", \"release_year\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47d4868a-94d8-4d36-85b9-b0c9a6203a8a", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\", \"release_year\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "b2c7922a-916e-4e01-829b-77cbb2205153", + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "markdown", + "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", + "metadata": {}, + "source": [ + "## Fixing Inconsistencies in Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47a41ef3-751a-41ed-869d-9f2c45509196", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"gross_income_usa\", \"gross_income_world\", \"movie_budget\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc483320-7895-4368-a672-b98f8d0c9755", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6cbd7ea-e168-442e-8dd9-e2955288fa57", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"gross_income_usa\", \"gross_income_world\", \"movie_budget\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", + 
"metadata": {}, + "source": [ + "## Correcting Spelling Errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e313152b-92b4-43a8-8483-637281a1f04d", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26b138d-72e5-4e15-a875-ee65023545d1", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"car_manufacturer\"].value_counts()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "fd4ae142-e339-4601-b0a4-84375eb28c02", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c596022b-02a4-40c0-ac5f-d0b0643a7a4a", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"car_manufacturer\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", + "metadata": {}, + "source": [ + "## Checking For Invalid Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"film_length\", 
\"martinis_consumed\"]].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73fe06b-5f42-4357-9b0f-2e460bf0dacf", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2abb5b80-48be-4a00-9483-4732b9a5d802", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"film_length\", \"martinis_consumed\"]].describe()" + ] + }, + { + "cell_type": "markdown", + "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", + "metadata": {}, + "source": [ + "## Removing Duplicate Data" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ").drop_duplicates(ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff05e0ae-4f9b-47a7-87f1-fb7630fabddc", + "metadata": {}, + "outputs": [], + "source": [ + 
"data.loc[data.duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1216a25-4791-4601-83ba-62513e4cc880", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"movie_title\"].value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba17e3f-3ce1-4885-a104-f60d254d9feb", + "metadata": {}, + "outputs": [], + "source": [ + " data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "52db1351-36ed-4104-a999-345ebbc62214", + "metadata": {}, + "source": [ + "## Storing Your Cleansed Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575a774e-6913-41fb-8ff9-4d786f478007", + "metadata": {}, + "outputs": [], + "source": [ + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Using Python for Data Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb_rating\"], data[\"rotten_tomatoes_rating\"])\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + 
"from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "x = data.loc[:, [\"imdb_rating\"]]\n", + "y = data.loc[:, \"rotten_tomatoes_rating\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots()\n", + "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"film_length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + 
"outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb_rating\"], data[\"bond_kills\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")\n", + "fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/james_bond_data.csv b/james_bond_data.csv new file mode 100644 index 0000000000..4a983b2201 --- /dev/null +++ b/james_bond_data.csv @@ -0,0 +1,28 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget ($ 000s),Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond +"June, 1962",Dr. 
No,Sean Connery,Sunbeam," $16,067,035.00 "," $59,567,035.00 "," $1,000.00 ",110 mins,7.3,7.7,2,4 +"August, 1963",From Russia with Love,Sean Connery,Bently," $24,800,000.00 "," $78,900,000.00 "," $2,000.00 ",115 mins,7.5,8,0,11 +"May, 1964",Goldfinger,Sean Connery,Aston Martin," $51,100,000.00 "," $124,900,000.00 "," $3,000.00 ",110 mins,7.8,8.4,1,9 +"September, 1965",Thunderball,Sean Connery,Aston Martin," $63,600,000.00 "," $141,200,000.00 "," $9,000.00 ",130 mins,7,6.8,0,20 +"November, 1967",You Only Live Twice,Sean Connery,Toyota," $43,100,000.00 "," $111,600,000.00 "," $9,500.00 ",117 mins,6.9,6.3,1,21 +"July, 1969",On Her Majesty's Secret Service,George Lazenby,Mercury," $22,800,000.00 "," $82,000,000.00 "," $8,000.00 ",142 mins,6.8,6.7,1,5 +"March, 1971",Diamonds Are Forever,Shawn Connery,Ford," $43,800,000.00 "," $116,000,000.00 "," $7,200.00 ",1200 mins,6.7,6.3,0,7 +"August, 1973",Live and Let Die,Roger Moore,AMC," $35,400,000.00 "," $161,800,000.00 "," $7,000.00 ",121 mins,6.8,5.9,0,8 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"April, 1977",The Spy Who Loved Me,Roger Moore,Lotus," $46,800,000.00 "," $185,400,000.00 "," $14,000.00 ",125 mins,,,1,31 +"October, 1979",Moonraker,Roger Moore,Lotus," $70,300,000.00 "," $210,300,000.00 "," $31,000.00 ",126 mins,6.2,5.7,1,12 +"June, 1981",For Your Eyes Only,Roger MOORE,Citroen," $54,800,000.00 "," $195,300,000.00 "," $28,000.00 ",127 mins,6.8,6.3,0,18 +"March, 1983",Octopussy,Roger Moore,Bajaj," $67,900,000.00 "," $187,500,000.00 "," $27,500.00 ",131 mins,6.5,5.3,0,15 +"October, 1985",A View to a Kill,Roger Moore,Rolls Royce," $50,327,960.00 "," $152,627,960.00 "," $30,000.00 ",131 mins,6.2,4.7,0,5 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," 
$40,000.00 ",130 mins,6.7,6.3,2,13 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"January, 1989",License to Kill,Timothy Dalton,Aston Martin," $34,667,015.00 "," $156,167,015.00 "," $42,000.00 ",133 mins,6.5,6,1,10 +"September, 1995",GoldenEye,Pierce Brosnan,BMW," $106,429,941.00 "," $356,429,941.00 "," $60,000.00 ",130 mins,7.2,6.9,1,47 +"July, 1997",Tomorrow Never Dies,Pierce Brosnan,Aston Martin," $125,304,276.00 "," $339,504,276.00 "," $110,000.00 ",119 mins,6.4,6,1,30 +"June, 1999",The World Is Not Enough,Pierce Brosnan,BMW," $126,930,660.00 "," $361,730,660.00 "," $135,000.00 ",128 mins,6.3,5.7,1,27 +"August, 2002",Die Another Day,Pierce Brosnan,Aston Martin," $160,942,139.00 "," $431,942,139.00 "," $142,000.00 ",133 mins,6,6.1,2,31 +"February, 2006",Casino Royale,Daniel Craig,Astin Martin," $167,365,000.00 "," $596,365,000.00 "," $102,000.00 ",144 mins,7.9,7.8,3,11 +"December, 2008",Quantum of Solace,Daniel Craig,Aston Martin," $169,368,427.00 "," $591,692,078.00 "," $230,000.00 ",106 mins,6.7,6.1,-6,16 +"November, 2012",Skyfall,Daniel Craig,Astin Martin," $304,360,277.00 "," $1,108,561,108.00 "," $200,000.00 ",143 mins,7.8,8.2,1,26 +"September, 2015",Spectre,Daniel Craig,Aston Martin," $200,074,175.00 "," $879,620,923.00 "," $245,000.00 ",148 mins,6.8,6.4,1,30 +"November, 2021",No Time to Die,Daniel Craig,Aston Martin," $160,891,007.00 "," $759,959,662.00 "," $275,000.00 ",163 mins,7.3,7.3,1,14 diff --git a/james_bond_data.json b/james_bond_data.json new file mode 100644 index 0000000000..852810b38e --- /dev/null +++ b/james_bond_data.json @@ -0,0 +1 @@ +{"Release":{"0":"June, 1962","1":"August, 1963","2":"May, 1964","3":"September, 1965","4":"November, 1967","5":"July, 1969","6":"March, 1971","7":"August, 1973","8":"July, 1974","9":"July, 1974","10":"April, 1977","11":"October, 1979","12":"June, 1981","13":"March, 1983","14":"October, 1985","15":"May, 
1987","16":"May, 1987","17":"January, 1989","18":"September, 1995","19":"July, 1997","20":"June, 1999","21":"August, 2002","22":"February, 2006","23":"December, 2008","24":"November, 2012","25":"September, 2015","26":"November, 2021"},"Movie":{"0":"Dr. No","1":"From Russia with Love","2":"Goldfinger","3":"Thunderball","4":"You Only Live Twice","5":"On Her Majesty's Secret Service","6":"Diamonds Are Forever","7":"Live and Let Die","8":"The Man with the Golden Gun","9":"The Man with the Golden Gun","10":"The Spy Who Loved Me","11":"Moonraker","12":"For Your Eyes Only","13":"Octopussy","14":"A View to a Kill","15":"The Living Daylights","16":"The Living Daylights","17":"License to Kill","18":"GoldenEye","19":"Tomorrow Never Dies","20":"The World Is Not Enough","21":"Die Another Day","22":"Casino Royale","23":"Quantum of Solace","24":"Skyfall","25":"Spectre","26":"No Time to Die"},"Bond":{"0":"Sean Connery","1":"Sean Connery","2":"Sean Connery","3":"Sean Connery","4":"Sean Connery","5":"George Lazenby","6":"Shawn Connery","7":"Roger Moore","8":"Roger Moore","9":"Roger Moore","10":"Roger Moore","11":"Roger Moore","12":"Roger MOORE","13":"Roger Moore","14":"Roger Moore","15":"Timothy Dalton","16":"Timothy Dalton","17":"Timothy Dalton","18":"Pierce Brosnan","19":"Pierce Brosnan","20":"Pierce Brosnan","21":"Pierce Brosnan","22":"Daniel Craig","23":"Daniel Craig","24":"Daniel Craig","25":"Daniel Craig","26":"Daniel Craig"},"Bond_Car_MFG":{"0":"Sunbeam","1":"Bently","2":"Aston Martin","3":"Aston Martin","4":"Toyota","5":"Mercury","6":"Ford","7":"AMC","8":"AMC","9":"AMC","10":"Lotus","11":"Lotus","12":"Citroen","13":"Bajaj","14":"Rolls Royce","15":"Rolls Royce","16":"Rolls Royce","17":"Aston Martin","18":"BMW","19":"Aston Martin","20":"BMW","21":"Aston Martin","22":"Astin Martin","23":"Aston Martin","24":"Astin Martin","25":"Aston Martin","26":"Aston Martin"},"US_Gross":{"0":" $16,067,035.00 ","1":" $24,800,000.00 ","2":" $51,100,000.00 ","3":" $63,600,000.00 ","4":" 
$43,100,000.00 ","5":" $22,800,000.00 ","6":" $43,800,000.00 ","7":" $35,400,000.00 ","8":" $21,000,000.00 ","9":" $21,000,000.00 ","10":" $46,800,000.00 ","11":" $70,300,000.00 ","12":" $54,800,000.00 ","13":" $67,900,000.00 ","14":" $50,327,960.00 ","15":" $51,185,000.00 ","16":" $51,185,000.00 ","17":" $34,667,015.00 ","18":" $106,429,941.00 ","19":" $125,304,276.00 ","20":" $126,930,660.00 ","21":" $160,942,139.00 ","22":" $167,365,000.00 ","23":" $169,368,427.00 ","24":" $304,360,277.00 ","25":" $200,074,175.00 ","26":" $160,891,007.00 "},"World_Gross":{"0":" $59,567,035.00 ","1":" $78,900,000.00 ","2":" $124,900,000.00 ","3":" $141,200,000.00 ","4":" $111,600,000.00 ","5":" $82,000,000.00 ","6":" $116,000,000.00 ","7":" $161,800,000.00 ","8":" $97,600,000.00 ","9":" $97,600,000.00 ","10":" $185,400,000.00 ","11":" $210,300,000.00 ","12":" $195,300,000.00 ","13":" $187,500,000.00 ","14":" $152,627,960.00 ","15":" $191,200,000.00 ","16":" $191,200,000.00 ","17":" $156,167,015.00 ","18":" $356,429,941.00 ","19":" $339,504,276.00 ","20":" $361,730,660.00 ","21":" $431,942,139.00 ","22":" $596,365,000.00 ","23":" $591,692,078.00 ","24":" $1,108,561,108.00 ","25":" $879,620,923.00 ","26":" $759,959,662.00 "},"Budget ($ 000s)":{"0":" $1,000.00 ","1":" $2,000.00 ","2":" $3,000.00 ","3":" $9,000.00 ","4":" $9,500.00 ","5":" $8,000.00 ","6":" $7,200.00 ","7":" $7,000.00 ","8":" $7,000.00 ","9":" $7,000.00 ","10":" $14,000.00 ","11":" $31,000.00 ","12":" $28,000.00 ","13":" $27,500.00 ","14":" $30,000.00 ","15":" $40,000.00 ","16":" $40,000.00 ","17":" $42,000.00 ","18":" $60,000.00 ","19":" $110,000.00 ","20":" $135,000.00 ","21":" $142,000.00 ","22":" $102,000.00 ","23":" $230,000.00 ","24":" $200,000.00 ","25":" $245,000.00 ","26":" $275,000.00 "},"Film_Length":{"0":"110 mins","1":"115 mins","2":"110 mins","3":"130 mins","4":"117 mins","5":"142 mins","6":"1200 mins","7":"121 mins","8":"125 mins","9":"125 mins","10":"125 mins","11":"126 mins","12":"127 mins","13":"131 
mins","14":"131 mins","15":"130 mins","16":"130 mins","17":"133 mins","18":"130 mins","19":"119 mins","20":"128 mins","21":"133 mins","22":"144 mins","23":"106 mins","24":"143 mins","25":"148 mins","26":"163 mins"},"Avg_User_IMDB":{"0":7.3,"1":7.5,"2":7.8,"3":7.0,"4":6.9,"5":6.8,"6":6.7,"7":6.8,"8":6.7,"9":6.7,"10":null,"11":6.2,"12":6.8,"13":6.5,"14":6.2,"15":6.7,"16":6.7,"17":6.5,"18":7.2,"19":6.4,"20":6.3,"21":6.0,"22":7.9,"23":6.7,"24":7.8,"25":6.8,"26":7.3},"Avg_User_Rtn_Tom":{"0":7.7,"1":8.0,"2":8.4,"3":6.8,"4":6.3,"5":6.7,"6":6.3,"7":5.9,"8":5.1,"9":5.1,"10":null,"11":5.7,"12":6.3,"13":5.3,"14":4.7,"15":6.3,"16":6.3,"17":6.0,"18":6.9,"19":6.0,"20":5.7,"21":6.1,"22":7.8,"23":6.1,"24":8.2,"25":6.4,"26":7.3},"Martinis":{"0":2,"1":0,"2":1,"3":0,"4":1,"5":1,"6":0,"7":0,"8":0,"9":0,"10":1,"11":1,"12":0,"13":0,"14":0,"15":2,"16":2,"17":1,"18":1,"19":1,"20":1,"21":2,"22":3,"23":-6,"24":1,"25":1,"26":1},"Kills_Bond":{"0":4,"1":11,"2":9,"3":20,"4":21,"5":5,"6":7,"7":8,"8":1,"9":1,"10":31,"11":12,"12":18,"13":15,"14":5,"15":13,"16":13,"17":10,"18":47,"19":30,"20":27,"21":31,"22":11,"23":16,"24":26,"25":30,"26":14}} \ No newline at end of file diff --git a/james_bond_data.parquet b/james_bond_data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..88bd22b4fb36adc606eaf6eacd5b46d56121e5d1 GIT binary patch literal 11050 zcmcgyZEPDydY+Y*HffWwBx@DYk?cK1W)x>7?FYZsa9k}Zks?D%7A29CF0HU6R}`h? 
zE{`9Q=rl!8^om?AIQRnR;5!tDG{}#jz4S+s25s6O*Zw#J2M6en^iZHd5geLJa43qP zD1stro4zxoBub?6f!HQIyEE_1ydTdy^Ugaf-%yeQLo=_QVt(5jXG8`KAoRoDr_UgS zVs%5~SV8uQc4#T}LcLnE^bJckXeHE!#o_UsgX)r2u4$#LRiYTDcUdi))DQOCMV$Op(gs2a;2zSCRo1%rY+=ZrlUmrW!t3f&*^|b zO`=5qx9Na%brR75@5tYxfF(W{Q;oV>vHX}0$icgamc6!h+uKNXykjfTvOM`W$S(4{ zNd31xtYs^87VT1QrRUsWh*iUNa}0CSCFd`us6ia9mx zL*wTTOygJ>>2;nZ3S&(Ce$aSW?}4X}ve?+?l^0~|<-f{Q|Ksn+AA94|EYRI=T80_B z%ZpC+xtynyny#r;jp-Tk4SjHW=w}x9OSIeikJ-<;j3R>kXw3avPMqaCA zRb4+1%~i9`E*N@)jTd(`c6qm$(=Gvd!C>dK3Y$>3v}&#KSe0GUauuxxugVS#8H6D- zMYUua`6{baG&XEjv>mO|2Xsr6SB*RyhtV@d?GZ%P`!rx=*zVYCn;J{qc4`J2sT+fi zOU|WogH3OmmbYf|Y(gVT%ruRPx~)|Ps1G=^!h(JkcDA8aEpJkfmhoySgNY3+NMTb& zZI`W?tja!B)b;+SH^B%nl$}uqxQ{$xRI;I$i4t3|_PlV(HJ zwSFKh)>WfcFR|tZyJYHWPCE_6rR~NBfK94pEmx~(&F#W27fTw}9yoVdhUdF@#@Tn= zX5jxN>+!vFhwIUjFn2&0UX8EYhCZ>nUwO#=Z)DMbMQL5vRJNKuU8s0 z42JEN@1-Wj6Cn$<;%2R0r5<&LinWTV8Ps4rsBWoSO=NYwidSe(qaizMx(N|&>l+Pe8@qS@?8*~Qa2ycbf8;uZ$&O)K52=;)L^wj= z`Wb=iMW_4M42J6n+pg$*cAB|E?+3U4?TzCN&W;H_j`#UFUh+=zJZl>q6Fppj=Q*C| zU4wS7zzNp}k=-Y8K6uj}DLjs7Q4HT8b?A}2oaZ{UM1gvqqFpc29$)wxvio^Xx_%AO zUJp#Lg8(@=j)s9KLe)2tr#^wB$G3-A$;0`ueBm;O`X8Sj<9(b*lsVZWP=D%R#zZeC z@g7d}`z9IW^l`Gpb3R+aab?WM!yu6pBzfe@AaIa;-u`F%udp)I0>H!Xxo>s19|*B!KTr0>dA(hYqx^ zEUiaiUsZ|oeqkH>*y8@xMfXoHVCQ`$ab2DIBP((ZIeusl zfWuvn+n7F4z&>V>Rgk@ySHRDCffwr)IUn)zm?B$D-4?!D{4E3L?0Z?5`M_n z;o+UUhETwF5n275IC&8vfB!|=jx2ky7a$Hu%=c_8Ee3onY-E3b2^ta)4dl7rhnQd5 zc;sVP+JP+3A>n#Iq9t#C(lP1^IrmAf##ii>Z4a)8`^&NN#}>J{nPJ3ulp}14k11ltmg}m1dWfeV0ra%vbEkiJ@i?x z`!?r(^KlIG$)dU<`^#SDJveFI1lbQM&I&RGfoDraqiVAwugi|4Nhjdva?&2rVn;UW z8ZGi8mmvUJT$U&jN20Xo8*z?Aku<{7lH?kr1=(c>bm8Ig00Vwx8F@yoLE)V~iLma1 zkyU8*Ijs}AYo4UX!!Q+fvoLUKYE6bVRU{g9HMhJT+~bJam^^yHVHESMGyU9 zuls%8{qZD5rkjASn0!Dnw*Zr2f=L-k7Rov@jVu&>aoTc@mM%H^?8DPP`7uSIM;uQ3 zBh&BRh3W-|6*(Q_@b2s9k;{P#JLZIO_;+dgxW!KEevcDl4nODgP|tW5Ds&C&_>^ zdjD^Lf`e0BPHm+3HtxLrz8fFb?oYkL%$e2&!J+IF)>o=pWj&gh2@)LsvUlh=J?@`- z+_$9RuOYT8Hm~$Be+Sr%5o~TzG`8J(k)Xp)+xo2RG{6P8SiyX!{Q`l>`tbAy5C+qtk5dr$udg1W)^~tfvSzG9B2R$(tas=qkfF)$cl-cc 
zryT6BPAcjO+zloWXU-7V#_1lo>>)%s&Ry#TP4Mrt&znCm`bg60wC;~^TKC5c!CLO5 z0+Yk^^ZC)s-@8rU8ha6rB=?)whZ(xHJgm(mu-H77f6z1Z8Rh=LlkT^#fq|VoPmt8r zz<=yv{sC6T6=LG+6iq{@Kxb{I;SH-{5DEju0_8LmD-;UKB`9_%15nOEaX>i(WeUng zD0n6HIjq+%5R?$YnLqw@Zl|ec2Xa~gTJpo%cT%CNOBz#zkh^s2d=h%yw?l_`ox#_@ z>iR*_{wh6mr`P?jiu-?l2N3JzeuB(yh`HYPBIl`UZc{6%j1w{T%RL9z`^zXk45|=X z^NQ6@A$N#&!jWvbz2hJYYbUm!)a~@+$4h%(dvM*Sp8JDDHkVfG--lLr%n zhkiOc68Hx(zg_jaF3n z@;S<}qlQ}2fMC>h9a_~Id|!bkPwHkFs&FmDbPd;qXMYzOO@`})yVl}+l+6ZwfXc5Q zr8V_huQke8-e}z@=FEIkg?g#nIOs65Tkts$t5(u#YF@3WF#MTO&4A(X6%)^0Sq@JG zMxSj{=BO58a*yi?dd_1??c|A^IH zW_$0wmA(J{rBiNu&%Kq|_6eXg8O zWp{KRlfI#wYR%`t6OAL$q^+YZoOEp81OBb6(-&yaHsT&Dv45@U6yDx*OlZ;QYWD&8 zi>yAp7j%j|+V>5Gl2fCisWr1gg1KJIR@6!Z$MSMR#eZ+Y3cw$nszuYlRKYXJPYRcm5*~o4+8j6+FbgH@*mg}o&LC52=Myi?(0Usj>dVy>~=Cw?a^hJ$ST}{vLtVLFe zST>BU<>vH_{G7f^_=aLTtEIBODlInq(6`*Gf9HlN$#%gO;>Qqr3P48fBa%w9dBK@_sOmY)=@T&)Y+>_%WjP0S!#gbfKYmV>mZ*)7hlPxvZ#=npKoNP`I{%{mrBiVB&FC+PD(sbf2Ksaf!BJw z>@Mp+la*q4FVE#l^37a>td~|yns48uLbjxDWyR!v%n`DgptpTZX2sZd*3$EQ69+-h zN5T--lA8zdVUDjx=i1^)C>F}6lg3(RvG7!Av5=Tic5{twf>9SePaON8)?>1ySF>ba3del z>uToYeGX^Xq1?`Mu$(o%9*#>QtjW3gjW3SfwQ@GHcQeC(aSZ>ay0!9%##%beXQgQC z?A|M{LavfCw#m5#=RVY9JNagu5n=t^RMT>uoaDkKe$L znf?4tQT#WgsVUrCj1sOmkvmNFU=~i@W{=>gD2bamX~YAAWKwA6iDr8?1kzv`J$`7oAq3wHqDIf7}q#|++_b3T} zOTVb1-+(%?FF};TiEg`nGnhKtw$FHdJ~v&VJ0!PGc^T`U@%EYIs7E5 zkfsvHuVb-N*~Rjww)X7_H|-%4+VH_x2D`<1C4lEQwp#YgHs=Qq{YN+{RXqRbdA6(M zVA53T20V!UE5!Z||6&H=!?T?5o5k{;Q1E=rxMeRvqRn2i{*LD$Wd0nPzk6&xPWs#N zBKDK{WKyVr$BW1GjgUT~I;oKbGTY1{F`S}ApjatE zKA#TXTwBB&WwE8-lTRWiTUIOIscW@~T)8|!E?jKuLd7<(TCThBx0)~ghyK%r&<*%s Gvi}c^zcEVy literal 0 HcmV?d00001 diff --git a/james_bond_data.xlsx b/james_bond_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1e042705fc38619b41b62bd98b1c1a807c061cc0 GIT binary patch literal 11269 zcmeIY1zQ|j);8QofW}>eLvRi5!QI`1ySqzp2m}opg1fuBy9Em#92$3d`<$8i&P>jH zzu=kb?%G{_?R#BaweO{SDat@WV*+3RZ~y>+6kv3kX`v4R0K`E902lx`NF5P-I~P+s 
z7XwvK2UBM~Mh{yXk{oDA>TCccc>e#6|HDt9EOAu6ml;*;R`N+?he3L^MhK4mBxn$w zR!N|%FJYj>SSQQU@--{s5mop*mKA3;`sAuN>-o4vjh$^nQ20QrGGbKVP?wrM0Vh-c z;4x(<4qm*IwiYNGlR$_C&%hwYEFJLGv9VW~O?2~vbQ!({7PdgZcINV+I@Ucbe%rBK3<*FpQEl6A6huDEDRWQ zvV;OjgJu>1iq=te*yXHtlfHd|Sh=#JXFVi8A1OC6-uA5%n6zZY#+k_DlG)O4=H&D?joh|4LjKwI^gv_p8BN| ze0rXI;LrL30|30fLID*2LoMson8~leT$2TB9RgS_4V+AEoS7K^IR8%_{}1cnzukIS zyqrQWGh!%6@+oZScK&A^s<4dv2gz1aRlfkKRh0UeTnd7pom6j8Rq=zNB>da`o<>%F z^2Hnuk=|^vl|`Xr@{%{Wmxrc2IJ&~pzH>?zcP!iNLvx$Ip1(9$MANnSGcrneP81x!?dv zFq~WVcqSEV$l1tzslsQ-hV=R#Uroh=+p@|i%ZZ1~)4;^0>rynm6XV&3K|W(hnT!qn zj&)LEkTUz)U%QU&d^FRuj}zEiI&?N16h&OL2KG>YSCU{9iLi9Ag1~`Q2nGCPJZzZU z?VYTR?Cq`p@LPGRathnbsO^8Z?4wkPh%3cif9|-DXJJhq`D7-2 zfotE~y%!8xpIDF~Ty! zsiS`KlMm@*7tRw5`kpM)eWzK?x;L{xl4d>!>6;MQ!)C$#2&XH5ZLH-X`|FJmz398+ z2FQMD^Jx>xq=G2Kp0zl}Ws4>e4@rPXDK}78LgH7DrRm&E7NE``K_K!Qv;ZkA&Eng= z4P!|{gG;JSt0wdOUWFR?Tg^>6Twq7ExR7>|I+^4e)pU!u9RnYiPGt1@j~TH#YQO1K zHQHY4+yVV}Jr@)M{LJ%d#BEciEVN=UYwC;ku2q7Iqn;(*0m~@wN9*&F4X7mwM=$XMc zNLr{p;b$yQplZU6E-T@e`%BBk(*!rpE3d-dtRDyN;`*IpE&8BCr@52sux%MuuRuc= zToy5z^wY@WY~$6yNTTM7gM#xo5?Zkf>22;sy{z{Do8#rB_6_SxT3GX(@!+@! zZ4o#aBFWV0R4#mLQ@PGR#Qo7AW!1yOFWHWc7C5w7m~0c31WYmc;{a^sjX_DO&8(E5t`1G z3;NM3klD`g;QTsxrT^U$AYM*Nex5=F|I6H$*IRiO9w;;8Zz`#5Lq?HouJ$btF0KVS!MVu&rY1R;b(a@ z6Q~8L-1qqIv?wSHg5p^2@`jah5cv~vVX z$`JMm%#J7%x1#%*m#;B+I1)4Yucge6-?SEo90I6RFFND5y1yi47`FTOgi2^GGNxCU zW=(_kL5CZ>$Z{@L%gw|*nLFqTTuo%ykjR`@DbYM|Zg>dfy-a7QUn6sdPyoP&b+A z8peW|f@1s@Wdm9q1FLlyP~7?F0{(eQOk1%#;y@MiryTMoRB7*<%g{f+Q`=1tgX)gb{gF+h8r3-$CPBr^jx7l zTRPN;DWE9X6(s3iRZE{QdoNX1g&UtGqT4&4D)rQ2Gd)aCkZ;rzI&c9_wBh&-nJF@4 z>N!mr_dMK=DtVkrjF704=HQTXN%L}Z0}L-Yaalr1{6%_Bmp4df!Dn{)Lp{y#r!*(}d^(ud zC$fa1=#DA&La+Rcqk41rY}<47;KDsqH+JJw3B}1>oqG|p=;_Hht?%n&Pat?}Q_u@> z4jg+<-drTvTsRp&fzl^@r95-;$U63%rVMI`r%vkb$#C>BZlw|0wM(=?3V7ik)M7Ko zA==)3NYfWLkM|59??oNA2eOm*1F)t5fVhc?IOK|szd{OPl90kVT>J!r|7XD!27 z0)C9Y*pM#dCT5N=$DT+4aCryu6&AHiJjQz5+N=9f~>r7A`t)7Lpu=$57u4o`bx1cC`CCI!YXR*TgkCz)JLz)j3BsX@1K5 zb|*C6TiejiDli7$a?i{pny!rXg62czg>g*6bW&Yzu0! 
zjJ(;~dj0ft%{m&)tEbxZvYMW@_=^SIUl8dRBHB9a$~N;;668xdeE7}F5XM_afnUVI zD^QfZqJkWahRbgBQGj1?Ojlu+$>8N{+5tjdkih zrgHlf_cCpv|+g2ewuU$ZSgw!Ffv7o7JpHOrOKoMDOwXJ!4{>{o#_Xf@e0W6 z(fWipDM|Mg6;F4?=e51mLxhro67I&_392zjwAjbkR4lYA`j=@|0^%A{vt)IU1X`si zBr(L##PTwA+L$EFS$ZKn`M~3o&A3qLYxZG7XX&=l4Hgw;rcbhOEoSn7-bSCD+naB`HYP+hr*8l#S+fQDiw zTiULDIdT)-0l{1!e&p7gY3C|F32?v-Do{^m9oM#RU(TAL4?O-Mwwx*5uIzkH`CyH} z%DHml)~+>V6oDW2tgX{ksexZ#t_Eu_gl%BZ^dUATztHrK(WVy zLtDr??8fWOHJ-`P(r26wC;9>E(k1g)4OA(Z0?!-`Y7^+71khj1$M{20f#_&;5zHzuz;vbY3XQO?E@0*sZ$4~X?CNr{#vI8fKnaf|{bEbD{7VZZlxM5S=3R2!`db|>L2pjk;c?p=dAIK?MmVcJB zylF8SqgLH^@AvTHvF)DTk%`E}p!C2yHPG~EJm*>=M?H#yoosD+tT=^vy>DryPnp!V z-Ft(bw&Q7F)3W1YnZTD9O;`jZ=gdDypnQw1gn}&xWna*=G?hqoR!@D3%!s{s{2{0` zwn!QoJwqf`kDc6r9$}_hlskoaCDru3PdrB5IG5D-agKzKdXtC^M3QB0o+c?jMrt>s zqf;@B-}}qsm0LoZn>M)pI{B(rt|~LWTcXaMld8N|#G!)c(^P~{KT!tMg9}=FQ??p_ zrZ@CFy%jG{-nm35%`)BjVsk*Pf=f6LlkxSYfKo=DS#P)~{4_sJJMZ>&xZ=<~k4=rgv3iBjJ?~EZ=uZ)_ z(>TGd!}$s>sr)^A4oL5TBZ2||u3!NGq<^?2XBST!Q|CX+hH5KzE6j+##8v*NSEnOx z+w?#i961%JH!UB9cnvEf8K{I^SxoYsd5+g5?&F&*8WERd$du-$Nl~N*HDWOLQ&ZEd zlQuSb2E&VWadd6bpH5gV&2Z9bHG^);1~d)^Q5);F5cF8yP~~&PYcHlD71Q-vD7DHc$4%W9iYiw)gjL1ecLqpWWw|A#SZgVb{?K)3 zwXbH5zu&UUt@~~bp_*^ujpf{`F)Hbt-l-1G;O%#Vqq~pA@5wv?UPx2gVfN$e9LKX0|M*ikzlOsos@}|N*L-eQJQu3gqKf|54 z)IxI}3!8MKn2zlog03rJaxZr<1hnzO#j2K2V#7^Z%_N|YPx`pDjtKpe?mER?6hV=Z zsB`;vkJGihG(@e%XR`ZKxuZ~Hy7f)0sS-oVpTI{CF4;53=Ts~osy*1s$|tQAY_jgb z>-M84!^1VhnD)x=UpI+Q5EafQSAG@VA95hHr*>v|CdgoWURpF?or>;tuwR=8$K6`2ro~-+Z=|kKHpwp9XVdsCX zz+-mRYITDH%2a#+0P~-UWbx6-)I`O_$cqGb#^-KyU98!IH`qLZ4I0$;7MY!rq z+*Qs%JX7KllBNF@8Ff9pVdr=6%ptU_Se+Asd2CW;>~&`M^z1&UO#tm4B7kQTz0^U7 zeC^aLT@t&K4D`65R+B<{Dq0X}SBtK=3T08j?xjW?uFBIzoKjZr$w1736DUA`lIfqg zrQ)ItKCj;0Hh$k>`X)Dx5M}(%LzDO_xppK)QBR_Dv!IAOe6iDBI=jwh%*Jsaop32`@;_UQ!h8m(W08^ z%N>eYbsdi6{C|mk$I)~2FM~!vosgP)(ZM-{(;K}xh*k=m$VZhuDKk&HaWA8Zc%y)|&TSsu-Xm!g(K+2rLzp zI%+Q%v8{kR*rW}>*GUY7=Grnx&W=imn%9A}2IFTH^?h+nUbamI#PM(UGd;Jur+MG` z-+$=~OT^iRx_~|2IJ~OE*dQ)7ial$~3?T^g`v@QK6k%EW_#qchhc^2P?@U8NyaLw( 
zR|MmM69WbNy?m;+I`&5%ASq7(YE3~i|Ca>rFA1W^!iuAEKgnnsu7YOKMOAZE`ls?$ z5#I*qqY^wEC!#dXCYx%60+e48Bdq(+sg_$vdHbOzI=>SY=M%}v1gZ#Q1k8lk%GpsW z(xypEeiJL3z05+wi4E%e<^?Ap8kzx!JS#;?(T-(cophEwxuPBbgUUyEJrqsoL|PLq1NS_{pUr*53w;KylQsrQ;7tSzE_gVM3tbjlG6NQ@7Puw&_n3Emfq24=~jIfc}V5h%0=_tesXj}kWPg6V$`<$ z?&CUKqRz(JD2syu!F;EqLnC5lcckc)TgagK>|>VLASp1O8s!37UACPQlEq~3728qc z*_3R^wr*6Jmh#)QtF5i|?Se*6w;13()rUm%Z$n#N8RzY9c#&=szje}jHiuj+kFx17 zUVZ3g9m>8HRzJF#7+H!#13G16ZT0K3;7-pyV#CZHEnkQ(l?o90w6#BNA!fppRJ2Uh zf!@JFl{s1Pz1<^nmG8>CTkkoA=`=yHn$*3!zb;^N7lt{ zj&p8bm)5A%H1VEv{MP@#C6AMWw^-To3PBbwpPrduaa}ZLcQXBFyU_pksKCB3I$qB@m>Dw^1p9~*?;WG0 zzL1T5ql8<=tT{MvD(qxs`Yx0r{d%3r>WF@&pK<(g_nbS-q%^oJn2EW;O7*iOW-b8J zUNnE?ZM)5nO?LP)W9h0ErMO&VdTp!}9K0WioG@9kx!q0hHXBkS$`#Ia?(!ml(Ppl zSgTSv#)p3TM6os?lg4P_b-nibaO}s#MKqM#^%^0QMsMDSk&W9nNi22_j?4cZxC3Ds zNTR^x!v<3m_20?&(ZS(=+yj&DpGQXgI5>wv{RDdRfD8#@`Beu8A8e#Z!Qf<}1f?E- znXIiPO0B$M2i>Y*pe$_Dd{W%_jE5+QTx^HZPA4915?tBmr4|X<4ve{el;&G7s1%3@ z!u5e-^eF2?lUhBz1EV&&hKEgtsmM@4+9GJ zJejU-V;m)jfE8aibY{;$>_2Y8yZ4t%H)JK|pC2TJuvUYG(O4mJE#7HX`WDu$1(jh} z*{Op86_Omq1RE?r{7aegkTC6s#r;?Q#B) zFwMBUgk%ZvQp_lYPQ`i)H@W%>L8`j& zwcUvE*eT)lreRmPKZSSvQ7cpWFnY&$w~txZeRkvRTby_fP(bzzom1P2Vcv#5qo+t!P-re1yhvcs2Q3x5U75ReSu z;P}5cT>fjp{x$xW_De;X{}k|_tzrKH{xPP2wc_6z#eN6=-Y)YOv<+OM{jGWCckqAq zDEtKl06qc#1pohZEc`C#_l@hnBsC)Ze}Cd{+t|NL`Ms|EmlS<)8UntR->b~O3;4YP z@RxvVa4G{9@K-(Hcj)gq&tFghqJKbt&wzfH@SiU9FFXLiLID8$&7uAd|4$?M@9>Iu c{|5i3NmP`1172MK03Q4b0Q+ovnm?}o9|Ua)9RL6T literal 0 HcmV?d00001 diff --git a/james_bond_data_cleansed.csv b/james_bond_data_cleansed.csv new file mode 100644 index 0000000000..f57c20a4d8 --- /dev/null +++ b/james_bond_data_cleansed.csv @@ -0,0 +1,26 @@ +bond_actor,bond_kills,car_manufacturer,film_length,gross_income_usa,gross_income_world,imdb_rating,martinis_consumed,movie_budget,movie_title,release_date,rotten_tomatoes_rating,release_Year +Sean Connery,4,Sunbeam,110,16067035.0,59567035.0,7.3,2,1000000.0,Dr. 
No,1962-06-01,7.7,1962 +Sean Connery,11,Bently,115,24800000.0,78900000.0,7.5,0,2000000.0,From Russia with Love,1963-08-01,8.0,1963 +Sean Connery,9,Aston Martin,110,51100000.0,124900000.0,7.8,1,3000000.0,Goldfinger,1964-05-01,8.4,1964 +Sean Connery,20,Aston Martin,130,63600000.0,141200000.0,7.0,0,9000000.0,Thunderball,1965-09-01,6.8,1965 +Sean Connery,21,Toyota,117,43100000.0,111600000.0,6.9,1,9500000.0,You Only Live Twice,1967-11-01,6.3,1967 +George Lazenby,5,Mercury,142,22800000.0,82000000.0,6.8,1,8000000.0,On Her Majesty's Secret Service,1969-07-01,6.7,1969 +Sean Connery,7,Ford,120,43800000.0,116000000.0,6.7,0,7200000.0,Diamonds Are Forever,1971-03-01,6.3,1971 +Roger Moore,8,AMC,121,35400000.0,161800000.0,6.8,0,7000000.0,Live and Let Die,1973-08-01,5.9,1973 +Roger Moore,1,AMC,125,21000000.0,97600000.0,6.7,0,7000000.0,The Man with the Golden Gun,1974-07-01,5.1,1974 +Roger Moore,31,Lotus,125,46800000.0,185400000.0,7.1,1,14000000.0,The Spy Who Loved Me,1977-04-01,6.8,1977 +Roger Moore,12,Lotus,126,70300000.0,210300000.0,6.2,1,31000000.0,Moonraker,1979-10-01,5.7,1979 +Roger Moore,18,Citroen,127,54800000.0,195300000.0,6.8,0,28000000.0,For Your Eyes Only,1981-06-01,6.3,1981 +Roger Moore,15,Bajaj,131,67900000.0,187500000.0,6.5,0,27500000.0,Octopussy,1983-03-01,5.3,1983 +Roger Moore,5,Rolls Royce,131,50327960.0,152627960.0,6.2,0,30000000.0,A View to a Kill,1985-10-01,4.7,1985 +Timothy Dalton,13,Rolls Royce,130,51185000.0,191200000.0,6.7,2,40000000.0,The Living Daylights,1987-05-01,6.3,1987 +Timothy Dalton,10,Aston Martin,133,34667015.0,156167015.0,6.5,1,42000000.0,License to Kill,1989-01-01,6.0,1989 +Pierce Brosnan,47,BMW,130,106429941.0,356429941.0,7.2,1,60000000.0,GoldenEye,1995-09-01,6.9,1995 +Pierce Brosnan,30,Aston Martin,119,125304276.0,339504276.0,6.4,1,110000000.0,Tomorrow Never Dies,1997-07-01,6.0,1997 +Pierce Brosnan,27,BMW,128,126930660.0,361730660.0,6.3,1,135000000.0,The World Is Not Enough,1999-06-01,5.7,1999 +Pierce Brosnan,31,Aston 
Martin,133,160942139.0,431942139.0,6.0,2,142000000.0,Die Another Day,2002-08-01,6.1,2002 +Daniel Craig,11,Aston Martin,144,167365000.0,596365000.0,7.9,3,102000000.0,Casino Royale,2006-02-01,7.8,2006 +Daniel Craig,16,Aston Martin,106,169368427.0,591692078.0,6.7,6,230000000.0,Quantum of Solace,2008-12-01,6.1,2008 +Daniel Craig,26,Aston Martin,143,304360277.0,1108561108.0,7.8,1,200000000.0,Skyfall,2012-11-01,8.2,2012 +Daniel Craig,30,Aston Martin,148,200074175.0,879620923.0,6.8,1,245000000.0,Spectre,2015-09-01,6.4,2015 +Daniel Craig,14,Aston Martin,163,160891007.0,759959662.0,7.3,1,275000000.0,No Time to Die,2021-11-01,7.3,2021