data/notebooks/Lab/Excel_validation_logic.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "7ab1c5b0",
"metadata": {},
"source": [
"# Input data Excel validation. \n",
"This notebook contains the steps and main logic to validate the input data from the user's excel. \n",
"The input Excel is generated with custom pre-validations to ensure that it complies with the expected data, and is also subject to a geocoding validation. The validation shown here is focused on checking that the fields in the Excel are filled correctly (in terms of required fields and expected values), prior to data ingestion.\n",
"\n",
"It is structured in two steps: \n",
"1) By-column check: required fills, correct formats, etc... \n",
"2) By-row check: correct/incorrect combinations of location information"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8db292db",
"metadata": {},
"outputs": [],
"source": [
"!pip install pandera --user"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d9b8f9dd",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pandera as pa\n",
"from pandera.typing import Series\n",
"import re"
]
},
{
"cell_type": "markdown",
"id": "716595c8",
"metadata": {},
"source": [
"## Read data. \n",
"- Read sheet from Google sheets \n",
"- Get correct field names, all in lowercase, replace spacing and symbols\n",
"- Remove rows with usage notes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4cb7eda0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>2</th>\n",
" <th>material</th>\n",
" <th>business_unit</th>\n",
" <th>tier_1_supplier</th>\n",
" <th>producer</th>\n",
" <th>location_type</th>\n",
" <th>country</th>\n",
" <th>address</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>2010_tons</th>\n",
" <th>...</th>\n",
" <th>2015_tons</th>\n",
" <th>2016_tons</th>\n",
" <th>2017_tons</th>\n",
" <th>2018_tons</th>\n",
" <th>2019_tons</th>\n",
" <th>2020_tons</th>\n",
" <th>...</th>\n",
" <th>data_source</th>\n",
" <th>comments</th>\n",
" <th>...</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10.05 Maize (corn)</td>\n",
" <td>Accessories</td>\n",
" <td>Cargill</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>Lebanon</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2400</td>\n",
" <td>...</td>\n",
" <td>2522</td>\n",
" <td>2547</td>\n",
" <td>2572</td>\n",
" <td>2598</td>\n",
" <td>2624</td>\n",
" <td>2650</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>aoeuijj</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10.05 Maize (corn)</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>Malaysia</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1300</td>\n",
" <td>...</td>\n",
" <td>1366</td>\n",
" <td>1380</td>\n",
" <td>1394</td>\n",
" <td>1408</td>\n",
" <td>1422</td>\n",
" <td>1436</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>09.01 Coffee, whether or not roasted or decaff...</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>United States of America</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1000</td>\n",
" <td>...</td>\n",
" <td>1050</td>\n",
" <td>1061</td>\n",
" <td>1072</td>\n",
" <td>1083</td>\n",
" <td>1094</td>\n",
" <td>1105</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>08.03 Bananas, including plantains; fresh or d...</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>Japan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>730</td>\n",
" <td>...</td>\n",
" <td>767</td>\n",
" <td>775</td>\n",
" <td>783</td>\n",
" <td>791</td>\n",
" <td>799</td>\n",
" <td>807</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>40 Rubber and articles thereof</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>India</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>490</td>\n",
" <td>...</td>\n",
" <td>515</td>\n",
" <td>520</td>\n",
" <td>525</td>\n",
" <td>530</td>\n",
" <td>535</td>\n",
" <td>540</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
"2 material business_unit \\\n",
"3 10.05 Maize (corn) Accessories \n",
"4 10.05 Maize (corn) Accessories \n",
"5 09.01 Coffee, whether or not roasted or decaff... Accessories \n",
"6 08.03 Bananas, including plantains; fresh or d... Accessories \n",
"7 40 Rubber and articles thereof Accessories \n",
"\n",
"2 tier_1_supplier producer location_type country address \\\n",
"3 Cargill Moll Unknown Lebanon NaN \n",
"4 NaN Moll Unknown Malaysia NaN \n",
"5 NaN Moll Unknown United States of America NaN \n",
"6 NaN Moll Unknown Japan NaN \n",
"7 NaN Moll Unknown India NaN \n",
"\n",
"2 latitude longitude 2010_tons ... 2015_tons 2016_tons 2017_tons 2018_tons \\\n",
"3 NaN NaN 2400 ... 2522 2547 2572 2598 \n",
"4 NaN NaN 1300 ... 1366 1380 1394 1408 \n",
"5 NaN NaN 1000 ... 1050 1061 1072 1083 \n",
"6 NaN NaN 730 ... 767 775 783 791 \n",
"7 NaN NaN 490 ... 515 520 525 530 \n",
"\n",
"2 2019_tons 2020_tons ... data_source comments ... \n",
"3 2624 2650 NaN NaN aoeuijj NaN \n",
"4 1422 1436 NaN NaN NaN NaN \n",
"5 1094 1105 NaN NaN NaN NaN \n",
"6 799 807 NaN NaN NaN NaN \n",
"7 535 540 NaN NaN NaN NaN \n",
"\n",
"[5 rows x 24 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sheet_url = \"https://docs.google.com/spreadsheets/d/16sQlhPXGaFpDPi_QWDsUCTZVJQMl9C8z6_KFJoBUR1Y/edit#gid=0\"\n",
"url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')\n",
"df = pd.read_csv(url)\n",
"\n",
"df.columns = df.iloc[2].str.lower().str.strip().str.replace(' ', '_')\n",
"df = df.rename(columns = lambda x: re.sub('_\\(º[en]\\)', '', x))\n",
"df = df.drop([0, 1, 2])\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "2b126ccd",
"metadata": {},
"source": [
"## Validate fields\n",
"\n",
"Create schema to validate the main fields: \n",
"- material, bussiness_unit, location_type, country and tonnage are **required** and must be filled \n",
"- tonnage is an integer (or coerce into it) greater than 0 \n",
"- latitude and longitude as float and within especific ranges (lat: -90 to 90; long: -180 to 180)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "653d2445",
"metadata": {},
"outputs": [],
"source": [
"class data_validation(pa.SchemaModel):\n",
" material: Series[str] = pa.Field(str_matches= \"[A-Za-z]*\", allow_duplicates=True, nullable=False)\n",
" business_unit: Series[str] = pa.Field(str_matches= \"[A-Za-z]*\", allow_duplicates=True, nullable=False)\n",
" location_type: Series[str] = pa.Field(str_matches= \"[A-Za-z]*\", allow_duplicates=True, nullable=False)\n",
" country: Series[str] = pa.Field(str_matches= \"[A-Za-z]*\", allow_duplicates=True, nullable=False)\n",
" tons: Series[int] = pa.Field(alias ='(.*_tons)', nullable=False, allow_duplicates=True, regex=True, coerce=True, in_range={\"min_value\": 0, \"max_value\": np.iinfo(np.int32).max})\n",
" latitude: Series[float] = pa.Field(nullable=True, allow_duplicates=True, coerce=True, in_range={\"min_value\": -90, \"max_value\": 90})\n",
" longitude: Series[float] = pa.Field(nullable=True, allow_duplicates=True, coerce=True, in_range={\"min_value\": -180, \"max_value\": 180})\n",
"\n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c1a5f381",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>2</th>\n",
" <th>material</th>\n",
" <th>business_unit</th>\n",
" <th>tier_1_supplier</th>\n",
" <th>producer</th>\n",
" <th>location_type</th>\n",
" <th>country</th>\n",
" <th>address</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>2010_tons</th>\n",
" <th>...</th>\n",
" <th>2015_tons</th>\n",
" <th>2016_tons</th>\n",
" <th>2017_tons</th>\n",
" <th>2018_tons</th>\n",
" <th>2019_tons</th>\n",
" <th>2020_tons</th>\n",
" <th>...</th>\n",
" <th>data_source</th>\n",
" <th>comments</th>\n",
" <th>...</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10.05 Maize (corn)</td>\n",
" <td>Accessories</td>\n",
" <td>Cargill</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>Lebanon</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2400</td>\n",
" <td>...</td>\n",
" <td>2522</td>\n",
" <td>2547</td>\n",
" <td>2572</td>\n",
" <td>2598</td>\n",
" <td>2624</td>\n",
" <td>2650</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>aoeuijj</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10.05 Maize (corn)</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>Malaysia</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1300</td>\n",
" <td>...</td>\n",
" <td>1366</td>\n",
" <td>1380</td>\n",
" <td>1394</td>\n",
" <td>1408</td>\n",
" <td>1422</td>\n",
" <td>1436</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>09.01 Coffee, whether or not roasted or decaff...</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>United States of America</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1000</td>\n",
" <td>...</td>\n",
" <td>1050</td>\n",
" <td>1061</td>\n",
" <td>1072</td>\n",
" <td>1083</td>\n",
" <td>1094</td>\n",
" <td>1105</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>08.03 Bananas, including plantains; fresh or d...</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>Japan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>730</td>\n",
" <td>...</td>\n",
" <td>767</td>\n",
" <td>775</td>\n",
" <td>783</td>\n",
" <td>791</td>\n",
" <td>799</td>\n",
" <td>807</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>40 Rubber and articles thereof</td>\n",
" <td>Accessories</td>\n",
" <td>NaN</td>\n",
" <td>Moll</td>\n",
" <td>Unknown</td>\n",
" <td>India</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>490</td>\n",
" <td>...</td>\n",
" <td>515</td>\n",
" <td>520</td>\n",
" <td>525</td>\n",
" <td>530</td>\n",
" <td>535</td>\n",
" <td>540</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
"2 material business_unit \\\n",
"3 10.05 Maize (corn) Accessories \n",
"4 10.05 Maize (corn) Accessories \n",
"5 09.01 Coffee, whether or not roasted or decaff... Accessories \n",
"6 08.03 Bananas, including plantains; fresh or d... Accessories \n",
"7 40 Rubber and articles thereof Accessories \n",
"\n",
"2 tier_1_supplier producer location_type country address \\\n",
"3 Cargill Moll Unknown Lebanon NaN \n",
"4 NaN Moll Unknown Malaysia NaN \n",
"5 NaN Moll Unknown United States of America NaN \n",
"6 NaN Moll Unknown Japan NaN \n",
"7 NaN Moll Unknown India NaN \n",
"\n",
"2 latitude longitude 2010_tons ... 2015_tons 2016_tons 2017_tons \\\n",
"3 NaN NaN 2400 ... 2522 2547 2572 \n",
"4 NaN NaN 1300 ... 1366 1380 1394 \n",
"5 NaN NaN 1000 ... 1050 1061 1072 \n",
"6 NaN NaN 730 ... 767 775 783 \n",
"7 NaN NaN 490 ... 515 520 525 \n",
"\n",
"2 2018_tons 2019_tons 2020_tons ... data_source comments ... \n",
"3 2598 2624 2650 NaN NaN aoeuijj NaN \n",
"4 1408 1422 1436 NaN NaN NaN NaN \n",
"5 1083 1094 1105 NaN NaN NaN NaN \n",
"6 791 799 807 NaN NaN NaN NaN \n",
"7 530 535 540 NaN NaN NaN NaN \n",
"\n",
"[5 rows x 24 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_validation.validate(df).head()"
]
},
{
"cell_type": "markdown",
"id": "5412d82b",
"metadata": {},
"source": [
"## Locations validation. \n",
"\n",
"Check if location data is in the correct format, according to the following logic: \n",
"(note: Country info and coordinates are validated in the previous step) \n",
"\n",
"- if location_type is Unknown, it should not contain info on address and coordinates (only country)\n",
"- if location_type is Country of production or Origin country, also no info on address and coordinates\n",
"- if location type is Point of production, Aggregation point or Origin suplier facility, it MUST contain address or coordinates\n",
"\n",
"The result is a log of the outcomes (correct or type of error) for each entry."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c56f19b5",
"metadata": {},
"outputs": [],
"source": [
"def location_validation(df):\n",
" for l in range(len(df)):\n",
" if 'country' in df.iloc[l]['location_type'].lower():\n",
" if not pd.isna(df.iloc[l]['address']) or not pd.isna(df.iloc[l]['latitude']) or not pd.isna(df.iloc[l]['longitude']):\n",
" print(f'Location entry {l+1}: WARNING location type can be updated')\n",
" else:\n",
" e=0 \n",
" \n",
" if 'unknown' in df.iloc[l]['location_type'].lower():\n",
" if not pd.isna(df.iloc[l]['address']) or not pd.isna(df.iloc[l]['latitude']) or not pd.isna(df.iloc[l]['longitude']):\n",
" print(f'Location entry {l+1}: WARNING location type can be updated')\n",
" else:\n",
" e=0 \n",
" \n",
" if 'point' in df.iloc[l]['location_type'].lower():\n",
" if pd.isna(df.iloc[l]['address']):\n",
" if pd.isna(df.iloc[l]['latitude']) or pd.isna(df.iloc[l]['longitude']):\n",
" print(f'LOCATION ERROR ON ENTRY {l+1}: address or latitude/longitude REQUIRED')\n",
" else:\n",
" e=0\n",
" else:\n",
" e=0 \n",
" if 'facility' in df.iloc[l]['location_type'].lower():\n",
" if pd.isna(df.iloc[l]['address']):\n",
" if pd.isna(df.iloc[l]['latitude']) or pd.isna(df.iloc[l]['longitude']):\n",
" print(f'LOCATION ERROR ON ENTRY {l+1}: address or latitude/longitude REQUIRED') \n",
" else:\n",
" e=0\n",
" else:\n",
" e=0 \n",
" \n",
" if e == 0:\n",
" print(f'Location entry {l+1}: OK') "
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "7f94d91d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Location entry 1: OK\n",
"Location entry 2: OK\n",
"Location entry 3: OK\n",
"Location entry 4: OK\n",
"Location entry 5: OK\n",
"Location entry 6: OK\n",
"Location entry 7: OK\n",
"Location entry 8: OK\n",
"Location entry 9: OK\n",
"Location entry 10: OK\n",
"Location entry 11: OK\n",
"Location entry 12: OK\n",
"Location entry 13: OK\n",
"Location entry 14: OK\n",
"Location entry 15: OK\n",
"Location entry 16: OK\n",
"Location entry 17: OK\n",
"Location entry 18: OK\n",
"Location entry 19: OK\n",
"Location entry 20: OK\n",
"Location entry 21: OK\n",
"Location entry 22: OK\n",
"Location entry 23: OK\n",
"Location entry 24: OK\n",
"Location entry 25: OK\n",
"Location entry 26: OK\n",
"Location entry 27: OK\n",
"Location entry 28: OK\n",
"Location entry 29: OK\n",
"Location entry 30: OK\n",
"Location entry 31: OK\n",
"Location entry 32: OK\n",
"Location entry 33: OK\n",
"Location entry 34: OK\n",
"Location entry 35: OK\n",
"Location entry 36: OK\n",
"Location entry 37: OK\n",
"Location entry 38: OK\n",
"Location entry 39: OK\n",
"Location entry 40: OK\n",
"Location entry 41: OK\n",
"Location entry 42: OK\n",
"Location entry 43: OK\n",
"Location entry 44: OK\n",
"Location entry 45: OK\n",
"Location entry 46: OK\n",
"Location entry 47: OK\n",
"Location entry 48: OK\n",
"Location entry 49: OK\n",
"Location entry 50: OK\n",
"Location entry 51: OK\n",
"Location entry 52: OK\n",
"Location entry 53: OK\n",
"Location entry 54: OK\n",
"Location entry 55: OK\n",
"Location entry 56: OK\n",
"Location entry 57: OK\n",
"Location entry 58: OK\n",
"Location entry 59: OK\n",
"Location entry 60: OK\n",
"Location entry 61: OK\n",
"Location entry 62: OK\n",
"Location entry 63: OK\n",
"Location entry 64: OK\n",
"Location entry 65: OK\n",
"Location entry 66: OK\n",
"Location entry 67: OK\n",
"Location entry 68: OK\n",
"Location entry 69: OK\n",
"Location entry 70: OK\n",
"Location entry 71: OK\n",
"Location entry 72: OK\n",
"Location entry 73: OK\n",
"Location entry 74: OK\n",
"Location entry 75: OK\n"
]
}
],
"source": [
"location_validation(df)"
]
},
{
"cell_type": "markdown",
"id": "301921df",
"metadata": {},
"source": [
"## Change some location data to check error detection. "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5f0c98b3",
"metadata": {},
"outputs": [],
"source": [
"df_invalid = df.copy()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4c505da4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Location entry 1: OK\n",
"Location entry 2: WARNING location type can be updated\n",
"Location entry 2: OK\n",
"Location entry 3: OK\n",
"Location entry 4: OK\n",
"Location entry 5: OK\n",
"Location entry 6: OK\n",
"Location entry 7: OK\n",
"Location entry 8: OK\n",
"Location entry 9: OK\n",
"Location entry 10: OK\n",
"Location entry 11: OK\n",
"Location entry 12: OK\n",
"Location entry 13: OK\n",
"Location entry 14: OK\n",
"LOCATION ERROR ON ENTRY 15: address or latitude/longitude REQUIRED\n",
"Location entry 15: OK\n",
"Location entry 16: OK\n",
"Location entry 17: OK\n",
"Location entry 18: OK\n",
"Location entry 19: OK\n",
"Location entry 20: OK\n",
"LOCATION ERROR ON ENTRY 21: address or latitude/longitude REQUIRED\n",
"Location entry 21: OK\n",
"Location entry 22: OK\n",
"Location entry 23: OK\n",
"Location entry 24: OK\n",
"Location entry 25: OK\n",
"Location entry 26: OK\n",
"Location entry 27: OK\n",
"Location entry 28: OK\n",
"Location entry 29: OK\n",
"Location entry 30: OK\n",
"Location entry 31: OK\n",
"Location entry 32: OK\n",
"Location entry 33: OK\n",
"Location entry 34: OK\n",
"Location entry 35: OK\n",
"Location entry 36: OK\n",
"Location entry 37: OK\n",
"Location entry 38: OK\n",
"Location entry 39: OK\n",
"Location entry 40: OK\n",
"Location entry 41: OK\n",
"Location entry 42: OK\n",
"Location entry 43: OK\n",
"Location entry 44: OK\n",
"Location entry 45: OK\n",
"Location entry 46: OK\n",
"Location entry 47: OK\n",
"Location entry 48: OK\n",
"Location entry 49: OK\n",
"Location entry 50: OK\n",
"Location entry 51: OK\n",
"Location entry 52: OK\n",
"Location entry 53: OK\n",
"Location entry 54: OK\n",
"Location entry 55: OK\n",
"Location entry 56: OK\n",
"Location entry 57: OK\n",
"Location entry 58: OK\n",
"Location entry 59: OK\n",
"Location entry 60: OK\n",
"Location entry 61: OK\n",
"Location entry 62: OK\n",
"Location entry 63: OK\n",
"Location entry 64: OK\n",
"Location entry 65: OK\n",
"Location entry 66: OK\n",
"Location entry 67: OK\n",
"Location entry 68: OK\n",
"Location entry 69: OK\n",
"Location entry 70: OK\n",
"Location entry 71: OK\n",
"Location entry 72: OK\n",
"Location entry 73: OK\n",
"Location entry 74: OK\n",
"Location entry 75: OK\n"
]
}
],
"source": [
"df_invalid.iloc[1]['address'] = 'Fake street'\n",
"df_invalid.iloc[14]['address'] = np.nan\n",
"df_invalid.iloc[20]['latitude'] = np.nan\n",
"#df_invalid.head(21)\n",
"\n",
"location_validation(df_invalid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "452bc90a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}