LucaCappelletti94/csv_trimming

View on GitHub
.ipynb_checkpoints/Untitled-checkpoint.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random_csv_generator import random_csv\n",
    "from ugly_csv_generator import uglify\n",
    "from csv_trimming.utils import is_na\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "99640387b9314b639540afffff04f99d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Adding empty columns', max=17, style=ProgressStyle(descriptio…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "csv = random_csv(5)\n",
    "ugly = uglify(csv, duplicate_schema=False, empty_rows=False, seed=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mask_edges(mask:np.ndarray):\n",
    "    left = right = 0\n",
    "    for val in mask:\n",
    "        if not val:\n",
    "            break\n",
    "        left += 1\n",
    "    for val in np.flip(mask):\n",
    "        if not val:\n",
    "            break\n",
    "        right += 1\n",
    "    mask[left:-right] = False\n",
    "    return mask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trim_padding(csv):\n",
    "    nan_mask = csv.applymap(is_na)\n",
    "    rows_mask = mask_edges(nan_mask.all(axis=1).values)\n",
    "    columns_mask = mask_edges(nan_mask.all(axis=0).values)\n",
    "    return csv[~rows_mask][csv.columns[~columns_mask]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trim_single_values(csv):\n",
    "    nan_mask = csv.applymap(is_na)\n",
    "    columns_mask = (~nan_mask.sum())<=1\n",
    "    print(columns_mask)\n",
    "    return csv[csv.columns[~columns_mask]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16    5\n",
      "17    0\n",
      "18    5\n",
      "19    0\n",
      "20    0\n",
      "21    5\n",
      "22    0\n",
      "23    5\n",
      "24    5\n",
      "25    0\n",
      "26    0\n",
      "27    5\n",
      "28    0\n",
      "29    0\n",
      "30    0\n",
      "31    5\n",
      "32    5\n",
      "33    0\n",
      "34    0\n",
      "35    0\n",
      "36    5\n",
      "37    5\n",
      "38    5\n",
      "39    5\n",
      "40    5\n",
      "41    5\n",
      "42    0\n",
      "43    5\n",
      "44    0\n",
      "45    0\n",
      "46    0\n",
      "47    0\n",
      "48    0\n",
      "49    5\n",
      "50    0\n",
      "51    5\n",
      "dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "      <th>51</th>\n",
       "      <th>46</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "      <td>birth_province.1</td>\n",
       "      <td>province_code</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SV</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              46                51             46  ...             46                51             46\n",
       "2  province_code  birth_province.1  province_code  ...  province_code  birth_province.1  province_code\n",
       "3             CT               NaN             CT  ...             CT               NaN             CT\n",
       "4             TO               NaN             TO  ...             TO               NaN             TO\n",
       "5             SV               NaN             SV  ...             SV               NaN             SV\n",
       "6             BG               NaN             BG  ...             BG               NaN             BG\n",
       "7             CR               NaN             CR  ...             CR               NaN             CR\n",
       "\n",
       "[6 rows x 36 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trim_single_values(trim_padding(ugly))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}