{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# National Water Data: HYDAT\n",
"\n",
"The Hydat data is made available through the following Government of Canada website: http://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/. \n",
"\n",
"The daily flow data is extracted from a provided MS access database and after transformation the data is stored into a SQL server database."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:22:25.782648Z",
"start_time": "2018-03-11T05:22:23.786453Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import warnings\n",
"import PythonTools as PT\n",
"import os\n",
"import sys\n",
"%matplotlib inline\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:22:25.960324Z",
"start_time": "2018-03-11T05:22:25.784649Z"
}
},
"outputs": [],
"source": [
"import urllib.request\n",
"\n",
"sourcefile = urllib.request.urlretrieve(\"http://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/Hydat_sqlite3_20180117.zip\",\n",
" \"Data/Hydat_sqlite3_20180117.zip\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:22:26.138959Z",
"start_time": "2018-03-11T05:22:25.962324Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"PT.unzip_file(path_to_zip_file='Data/Hydat_sqlite3_20180117.zip', destination='Data')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:22:26.312087Z",
"start_time": "2018-03-11T05:22:26.140963Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"hydat_path = \"Data/Hydat.sqlite3\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hydat data\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:22:26.486431Z",
"start_time": "2018-03-11T05:22:26.314086Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sqliet version: 2.6.0\n",
"Successful Connection\n"
]
}
],
"source": [
"hydat_connection = PT.Sqlite3Connect(hydat_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Input Query (data Extraction)\n",
"\n",
"A query to extract a portion of the data to perform required transformation followed by data analysis."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:12.380005Z",
"start_time": "2018-03-11T05:22:26.488433Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" FLOW1 | \n",
" FLOW_SYMBOL1 | \n",
" FLOW2 | \n",
" FLOW_SYMBOL2 | \n",
" FLOW3 | \n",
" FLOW_SYMBOL3 | \n",
" FLOW4 | \n",
" ... | \n",
" FLOW28 | \n",
" FLOW_SYMBOL28 | \n",
" FLOW29 | \n",
" FLOW_SYMBOL29 | \n",
" FLOW30 | \n",
" FLOW_SYMBOL30 | \n",
" FLOW31 | \n",
" FLOW_SYMBOL31 | \n",
" MIN | \n",
" MAX | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 16.600000 | \n",
" E | \n",
" 16.600000 | \n",
" E | \n",
" 16.600000 | \n",
" E | \n",
" 16.600000 | \n",
" ... | \n",
" 29.700001 | \n",
" E | \n",
" 29.700001 | \n",
" E | \n",
" 29.700001 | \n",
" E | \n",
" 29.700001 | \n",
" E | \n",
" 15.300000 | \n",
" 29.700001 | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.700001 | \n",
" E | \n",
" 29.700001 | \n",
" E | \n",
" 29.700001 | \n",
" E | \n",
" 29.700001 | \n",
" ... | \n",
" 31.400000 | \n",
" None | \n",
" 31.400000 | \n",
" None | \n",
" 29.200001 | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 29.200001 | \n",
" 34.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 29.200001 | \n",
" None | \n",
" 29.200001 | \n",
" None | \n",
" 29.200001 | \n",
" None | \n",
" 29.200001 | \n",
" ... | \n",
" 17.600000 | \n",
" None | \n",
" 17.600000 | \n",
" None | \n",
" 17.600000 | \n",
" None | \n",
" 17.600000 | \n",
" None | \n",
" 17.600000 | \n",
" 29.200001 | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" B | \n",
" 16.100000 | \n",
" B | \n",
" 16.100000 | \n",
" B | \n",
" 16.100000 | \n",
" ... | \n",
" 17.400000 | \n",
" B | \n",
" 17.400000 | \n",
" B | \n",
" 17.400000 | \n",
" B | \n",
" 17.400000 | \n",
" B | \n",
" 16.100000 | \n",
" 17.400000 | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 16.299999 | \n",
" B | \n",
" 16.299999 | \n",
" B | \n",
" 16.299999 | \n",
" B | \n",
" 16.299999 | \n",
" ... | \n",
" 13.500000 | \n",
" B | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 13.500000 | \n",
" 16.299999 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 67 columns
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH FLOW1 FLOW_SYMBOL1 FLOW2 FLOW_SYMBOL2 \\\n",
"0 01AD001 1928 10 16.600000 E 16.600000 E \n",
"1 01AD001 1928 11 29.700001 E 29.700001 E \n",
"2 01AD001 1928 12 29.200001 None 29.200001 None \n",
"3 01AD001 1929 1 16.100000 B 16.100000 B \n",
"4 01AD001 1929 2 16.299999 B 16.299999 B \n",
"\n",
" FLOW3 FLOW_SYMBOL3 FLOW4 ... FLOW28 FLOW_SYMBOL28 \\\n",
"0 16.600000 E 16.600000 ... 29.700001 E \n",
"1 29.700001 E 29.700001 ... 31.400000 None \n",
"2 29.200001 None 29.200001 ... 17.600000 None \n",
"3 16.100000 B 16.100000 ... 17.400000 B \n",
"4 16.299999 B 16.299999 ... 13.500000 B \n",
"\n",
" FLOW29 FLOW_SYMBOL29 FLOW30 FLOW_SYMBOL30 FLOW31 \\\n",
"0 29.700001 E 29.700001 E 29.700001 \n",
"1 31.400000 None 29.200001 None NaN \n",
"2 17.600000 None 17.600000 None 17.600000 \n",
"3 17.400000 B 17.400000 B 17.400000 \n",
"4 NaN None NaN None NaN \n",
"\n",
" FLOW_SYMBOL31 MIN MAX \n",
"0 E 15.300000 29.700001 \n",
"1 None 29.200001 34.000000 \n",
"2 None 17.600000 29.200001 \n",
"3 B 16.100000 17.400000 \n",
"4 None 13.500000 16.299999 \n",
"\n",
"[5 rows x 67 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query='''\n",
"SELECT\n",
" F.STATION_NUMBER\n",
" ,YEAR\n",
" ,MONTH\n",
" ,FLOW1\n",
" ,FLOW_SYMBOL1\n",
" ,FLOW2\n",
" ,FLOW_SYMBOL2\n",
" ,FLOW3\n",
" ,FLOW_SYMBOL3\n",
" ,FLOW4\n",
" ,FLOW_SYMBOL4\n",
" ,FLOW5\n",
" ,FLOW_SYMBOL5\n",
" ,FLOW6\n",
" ,FLOW_SYMBOL6\n",
" ,FLOW7\n",
" ,FLOW_SYMBOL7\n",
" ,FLOW8\n",
" ,FLOW_SYMBOL8\n",
" ,FLOW9\n",
" ,FLOW_SYMBOL9\n",
" ,FLOW10\n",
" ,FLOW_SYMBOL10\n",
" ,FLOW11\n",
" ,FLOW_SYMBOL11\n",
" ,FLOW12\n",
" ,FLOW_SYMBOL12\n",
" ,FLOW13\n",
" ,FLOW_SYMBOL13\n",
" ,FLOW14\n",
" ,FLOW_SYMBOL14\n",
" ,FLOW15\n",
" ,FLOW_SYMBOL15\n",
" ,FLOW16\n",
" ,FLOW_SYMBOL16\n",
" ,FLOW17\n",
" ,FLOW_SYMBOL17\n",
" ,FLOW18\n",
" ,FLOW_SYMBOL18\n",
" ,FLOW19\n",
" ,FLOW_SYMBOL19\n",
" ,FLOW20\n",
" ,FLOW_SYMBOL20\n",
" ,FLOW21\n",
" ,FLOW_SYMBOL21\n",
" ,FLOW22\n",
" ,FLOW_SYMBOL22\n",
" ,FLOW23\n",
" ,FLOW_SYMBOL23\n",
" ,FLOW24\n",
" ,FLOW_SYMBOL24\n",
" ,FLOW25\n",
" ,FLOW_SYMBOL25\n",
" ,FLOW26\n",
" ,FLOW_SYMBOL26\n",
" ,FLOW27\n",
" ,FLOW_SYMBOL27\n",
" ,FLOW28\n",
" ,FLOW_SYMBOL28\n",
" ,FLOW29\n",
" ,FLOW_SYMBOL29\n",
" ,FLOW30\n",
" ,FLOW_SYMBOL30\n",
" ,FLOW31\n",
" ,FLOW_SYMBOL31\n",
" ,MIN\n",
" ,MAX\n",
"FROM DLY_FLOWS F\n",
"INNER JOIN STATIONS S\n",
" ON F.STATION_NUMBER = S.STATION_NUMBER\n",
"'''\n",
"df_hydat = hydat_connection.execute_query(query)\n",
"df_hydat.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Choose a subset of data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:12.659053Z",
"start_time": "2018-03-11T05:23:12.388011Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"df_hydat = df_hydat.head(1000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Break table (Days, Symbols)\n",
"\n",
"Each day has a flow value along with a symbol. The transformation un-pivots the source data based on daily flow and symbol. This is implemented below and a preview of the results are provided."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:12.869161Z",
"start_time": "2018-03-11T05:23:12.661055Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"basecolumn = ['STATION_NUMBER', 'YEAR', 'MONTH', 'MIN', 'MAX']\n",
"column_flow_day = basecolumn+['FLOW%i'%(i) for i in range(1,32)]\n",
"column_symbol_day= basecolumn+['FLOW_SYMBOL%i'%(i) for i in range(1,32)]\n",
"\n",
"df_hydat_flow = df_hydat[column_flow_day]\n",
"df_hydat_symbol = df_hydat[column_symbol_day]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Unpivot Days (melt)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:13.094857Z",
"start_time": "2018-03-11T05:23:12.871164Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" FLOW1 | \n",
" 16.600000 | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" FLOW1 | \n",
" 29.700001 | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" FLOW1 | \n",
" 29.200001 | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" FLOW1 | \n",
" 16.100000 | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" FLOW1 | \n",
" 16.299999 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue\n",
"0 01AD001 1928 10 15.300000 29.700001 FLOW1 16.600000\n",
"1 01AD001 1928 11 29.200001 34.000000 FLOW1 29.700001\n",
"2 01AD001 1928 12 17.600000 29.200001 FLOW1 29.200001\n",
"3 01AD001 1929 1 16.100000 17.400000 FLOW1 16.100000\n",
"4 01AD001 1929 2 13.500000 16.299999 FLOW1 16.299999"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_hydat_flow_melt = pd.melt(df_hydat_flow,\n",
" id_vars=['STATION_NUMBER', 'YEAR', 'MONTH', 'MIN', 'MAX'],\n",
" var_name = 'Day',\n",
" value_name=\"FlowValue\")\n",
"df_hydat_flow_melt.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Unpivot Symbols"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:13.332832Z",
"start_time": "2018-03-11T05:23:13.097860Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" FlowSymbol | \n",
" Symbol | \n",
" Day | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" FLOW_SYMBOL1 | \n",
" E | \n",
" FLOW1 | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" FLOW_SYMBOL1 | \n",
" E | \n",
" FLOW1 | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" FLOW_SYMBOL1 | \n",
" None | \n",
" FLOW1 | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" FLOW_SYMBOL1 | \n",
" B | \n",
" FLOW1 | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" FLOW_SYMBOL1 | \n",
" B | \n",
" FLOW1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX FlowSymbol Symbol \\\n",
"0 01AD001 1928 10 15.300000 29.700001 FLOW_SYMBOL1 E \n",
"1 01AD001 1928 11 29.200001 34.000000 FLOW_SYMBOL1 E \n",
"2 01AD001 1928 12 17.600000 29.200001 FLOW_SYMBOL1 None \n",
"3 01AD001 1929 1 16.100000 17.400000 FLOW_SYMBOL1 B \n",
"4 01AD001 1929 2 13.500000 16.299999 FLOW_SYMBOL1 B \n",
"\n",
" Day \n",
"0 FLOW1 \n",
"1 FLOW1 \n",
"2 FLOW1 \n",
"3 FLOW1 \n",
"4 FLOW1 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_hydat_symbol_melt = pd.melt(df_hydat_symbol,\n",
" id_vars=['STATION_NUMBER', 'YEAR', 'MONTH', 'MIN', 'MAX'],\n",
" var_name = 'FlowSymbol',\n",
" value_name=\"Symbol\")\n",
"df_hydat_symbol_melt['Day'] = df_hydat_symbol_melt['FlowSymbol'].apply(lambda s: 'FLOW'+s.split('FLOW_SYMBOL')[1])\n",
"df_hydat_symbol_melt.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Join tables\n",
"\n",
"The un-pivoted tables for daily flow and flow symbol are merged/joined together and the extra columns are excluded."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:13.578508Z",
"start_time": "2018-03-11T05:23:13.335834Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
" MIN_ | \n",
" MAX_ | \n",
" FlowSymbol | \n",
" Symbol | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" FLOW1 | \n",
" 16.600000 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" FLOW_SYMBOL1 | \n",
" E | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" FLOW1 | \n",
" 29.700001 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" FLOW_SYMBOL1 | \n",
" E | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" FLOW1 | \n",
" 29.200001 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" FLOW_SYMBOL1 | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" FLOW1 | \n",
" 16.100000 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" FLOW_SYMBOL1 | \n",
" B | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" FLOW1 | \n",
" 16.299999 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" FLOW_SYMBOL1 | \n",
" B | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue \\\n",
"0 01AD001 1928 10 15.300000 29.700001 FLOW1 16.600000 \n",
"1 01AD001 1928 11 29.200001 34.000000 FLOW1 29.700001 \n",
"2 01AD001 1928 12 17.600000 29.200001 FLOW1 29.200001 \n",
"3 01AD001 1929 1 16.100000 17.400000 FLOW1 16.100000 \n",
"4 01AD001 1929 2 13.500000 16.299999 FLOW1 16.299999 \n",
"\n",
" MIN_ MAX_ FlowSymbol Symbol \n",
"0 15.300000 29.700001 FLOW_SYMBOL1 E \n",
"1 29.200001 34.000000 FLOW_SYMBOL1 E \n",
"2 17.600000 29.200001 FLOW_SYMBOL1 None \n",
"3 16.100000 17.400000 FLOW_SYMBOL1 B \n",
"4 13.500000 16.299999 FLOW_SYMBOL1 B "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hydat_initial_join = df_hydat_flow_melt.merge(df_hydat_symbol_melt,\n",
" left_on=['STATION_NUMBER','YEAR', 'MONTH', 'Day'],\n",
" right_on=['STATION_NUMBER','YEAR', 'MONTH', 'Day'],\n",
" how='inner',suffixes=('', '_'))\n",
"hydat_initial_join.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Drop/rename columns"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:13.785657Z",
"start_time": "2018-03-11T05:23:13.580511Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
" Symbol | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" FLOW1 | \n",
" 16.600000 | \n",
" E | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" FLOW1 | \n",
" 29.700001 | \n",
" E | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" FLOW1 | \n",
" 29.200001 | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" FLOW1 | \n",
" 16.100000 | \n",
" B | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" FLOW1 | \n",
" 16.299999 | \n",
" B | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol\n",
"0 01AD001 1928 10 15.300000 29.700001 FLOW1 16.600000 E\n",
"1 01AD001 1928 11 29.200001 34.000000 FLOW1 29.700001 E\n",
"2 01AD001 1928 12 17.600000 29.200001 FLOW1 29.200001 None\n",
"3 01AD001 1929 1 16.100000 17.400000 FLOW1 16.100000 B\n",
"4 01AD001 1929 2 13.500000 16.299999 FLOW1 16.299999 B"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hydat_initial_join.drop(['MIN_', 'MAX_','FlowSymbol'], axis=1, inplace=True)\n",
"hydat_initial_join.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:14.018040Z",
"start_time": "2018-03-11T05:23:13.787660Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
" Symbol | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" 1 | \n",
" 16.600000 | \n",
" E | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" 1 | \n",
" 29.700001 | \n",
" E | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" 1 | \n",
" 29.200001 | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" 1 | \n",
" 16.100000 | \n",
" B | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" 1 | \n",
" 16.299999 | \n",
" B | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol\n",
"0 01AD001 1928 10 15.300000 29.700001 1 16.600000 E\n",
"1 01AD001 1928 11 29.200001 34.000000 1 29.700001 E\n",
"2 01AD001 1928 12 17.600000 29.200001 1 29.200001 None\n",
"3 01AD001 1929 1 16.100000 17.400000 1 16.100000 B\n",
"4 01AD001 1929 2 13.500000 16.299999 1 16.299999 B"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hydat_initial_join['Day'] = hydat_initial_join['Day'].apply( lambda d: int( d.split('FLOW')[1] ) )\n",
"hydat_initial_join.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Adding date\n",
"\n",
"Year, month and day columns are combined to provide a date column that can be used for data analysis."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:15.910931Z",
"start_time": "2018-03-11T05:23:14.020043Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
" Symbol | \n",
" DateKey | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" 1 | \n",
" 16.600000 | \n",
" E | \n",
" 19281001 | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" 1 | \n",
" 29.700001 | \n",
" E | \n",
" 19281101 | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" 1 | \n",
" 29.200001 | \n",
" None | \n",
" 19281201 | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" 1 | \n",
" 16.100000 | \n",
" B | \n",
" 19290101 | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" 1 | \n",
" 16.299999 | \n",
" B | \n",
" 19290201 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol \\\n",
"0 01AD001 1928 10 15.300000 29.700001 1 16.600000 E \n",
"1 01AD001 1928 11 29.200001 34.000000 1 29.700001 E \n",
"2 01AD001 1928 12 17.600000 29.200001 1 29.200001 None \n",
"3 01AD001 1929 1 16.100000 17.400000 1 16.100000 B \n",
"4 01AD001 1929 2 13.500000 16.299999 1 16.299999 B \n",
"\n",
" DateKey \n",
"0 19281001 \n",
"1 19281101 \n",
"2 19281201 \n",
"3 19290101 \n",
"4 19290201 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hydat_initial_join['DateKey'] = hydat_initial_join[['YEAR', 'MONTH', 'Day']].apply(lambda x : \"{}{}{}\".format(x['YEAR'], str(x['MONTH']).zfill(2), str(x['Day']).zfill(2)), axis=1)\n",
"hydat_initial_join.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Export to CSV\n",
"\n",
"The final result of the transformations for hydat data is written to a csv file."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:16.472148Z",
"start_time": "2018-03-11T05:23:15.913933Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"PT.write_csv(hydat_initial_join,flname='Hydat.csv', chunksize=10000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write to SQL server"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Initialize a connection"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:16.761733Z",
"start_time": "2018-03-11T05:23:16.472148Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successful Connection\n"
]
}
],
"source": [
"connection = PT.ServerConnect(server='DESKTOP-AN4AQCT\\SQLEXPRESS', database='Dev')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get server version"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:16.980893Z",
"start_time": "2018-03-11T05:23:16.763735Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Microsoft SQL Server 2016 (SP1-GDR) (KB4019089) - 13.0.4206.0 (X64) \n",
"\tJul 6 2017 07:55:03 \n",
"\tCopyright (c) Microsoft Corporation\n",
"\tExpress Edition (64-bit) on Windows 10 Home 6.3 (Build 16299: )\n",
"\n"
]
}
],
"source": [
"print(connection.get_server_version())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create a new table"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:17.203065Z",
"start_time": "2018-03-11T05:23:16.982895Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"cmd = '''\n",
"DROP TABLE IF EXISTS [{schema}].[{TableName}]\n",
"CREATE TABLE [{schema}].[{TableName}]\n",
"(\n",
" [RowId] INT IDENTITY (1, 1) NOT NULL,\n",
" [STATION_NUMBER] NVARCHAR(10) Null,\n",
" [YEAR] INT Null,\n",
" [MONTH] INT Null,\n",
" [MIN] DECIMAL(19,5) Null,\n",
" [MAX] DECIMAL(19,5) Null,\n",
" [Day] INT Null,\n",
" [FlowValue] DECIMAL(19,5) Null,\n",
" [Symbol] NVARCHAR(5) Null,\n",
" [DateKey] INT Null,\n",
")\n",
"\n",
"CREATE CLUSTERED COLUMNSTORE INDEX IX_Hydat_DailyFlow ON [{schema}].[{TableName}]\n",
"\n",
"\n",
"'''.format(schema = 'dbo', TableName='Hydat_FlowData')\n",
"connection.deploy(cmd)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Write to SQL Server"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:22.778804Z",
"start_time": "2018-03-11T05:23:17.205065Z"
}
},
"outputs": [],
"source": [
"PT.write_to_sql(hydat_initial_join,table_name='Hydat_FlowData',if_exists='append', schema='dbo',\n",
" connection_string=connection.connection_string)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Read Back From Server"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:23.483822Z",
"start_time": "2018-03-11T05:23:22.778804Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" RowId | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
" Symbol | \n",
" DateKey | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.3 | \n",
" 29.7 | \n",
" 1 | \n",
" 16.6 | \n",
" E | \n",
" 19281001 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.2 | \n",
" 34.0 | \n",
" 1 | \n",
" 29.7 | \n",
" E | \n",
" 19281101 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.6 | \n",
" 29.2 | \n",
" 1 | \n",
" 29.2 | \n",
" None | \n",
" 19281201 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.1 | \n",
" 17.4 | \n",
" 1 | \n",
" 16.1 | \n",
" B | \n",
" 19290101 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.5 | \n",
" 16.3 | \n",
" 1 | \n",
" 16.3 | \n",
" B | \n",
" 19290201 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" RowId STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol \\\n",
"0 1 01AD001 1928 10 15.3 29.7 1 16.6 E \n",
"1 2 01AD001 1928 11 29.2 34.0 1 29.7 E \n",
"2 3 01AD001 1928 12 17.6 29.2 1 29.2 None \n",
"3 4 01AD001 1929 1 16.1 17.4 1 16.1 B \n",
"4 5 01AD001 1929 2 13.5 16.3 1 16.3 B \n",
"\n",
" DateKey \n",
"0 19281001 \n",
"1 19281101 \n",
"2 19281201 \n",
"3 19290101 \n",
"4 19290201 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query ='''\n",
"SELECT *\n",
"FROM [{schema}].[{TableName}]\n",
"'''.format(schema = 'dbo', TableName='Hydat_FlowData')\n",
"DF_Hydat = connection.execute_query(query)\n",
"DF_Hydat.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:23.663449Z",
"start_time": "2018-03-11T05:23:23.485826Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"connection.close()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Write to sqlite database"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:23.959769Z",
"start_time": "2018-03-11T05:23:23.665451Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sqliet version: 2.6.0\n",
"Successful Connection\n"
]
}
],
"source": [
"sqlite_connection = PT.Sqlite3Connect('Data/Hydat_Transformed.db')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:24.140916Z",
"start_time": "2018-03-11T05:23:23.961770Z"
}
},
"outputs": [],
"source": [
"cmd1 = '''DROP TABLE IF EXISTS {schema}.{TableName};'''.format(schema='main', TableName='Hydat_FlowData')\n",
"cmd2 = '''\n",
"CREATE TABLE \n",
"IF NOT EXISTS {schema}.{TableName}(\n",
" [STATION_NUMBER] NVARCHAR(10) Null,\n",
" [YEAR] INT Null,\n",
" [MONTH] INT Null,\n",
" [MIN] DECIMAL(19,5) Null,\n",
" [MAX] DECIMAL(19,5) Null,\n",
" [Day] INT Null,\n",
" [FlowValue] DECIMAL(19,5) Null,\n",
" [Symbol] NVARCHAR(5) Null,\n",
" [DateKey] INT Null\n",
");\n",
"'''.format(schema='main', TableName='Hydat_FlowData')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:24.575669Z",
"start_time": "2018-03-11T05:23:24.142920Z"
}
},
"outputs": [],
"source": [
"sqlite_connection.deploy(cmd1)\n",
"sqlite_connection.deploy(cmd2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:25.455274Z",
"start_time": "2018-03-11T05:23:24.580672Z"
}
},
"outputs": [],
"source": [
"PT.write_to_sql(hydat_initial_join,\n",
" table_name='Hydat_FlowData',\n",
" if_exists='append',\n",
" schema='main',\n",
" driver='sqlite',\n",
" index=False,\n",
" connection_string=sqlite_connection.database)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:25.776191Z",
"start_time": "2018-03-11T05:23:25.455274Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION_NUMBER | \n",
" YEAR | \n",
" MONTH | \n",
" MIN | \n",
" MAX | \n",
" Day | \n",
" FlowValue | \n",
" Symbol | \n",
" DateKey | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 10 | \n",
" 15.300000 | \n",
" 29.700001 | \n",
" 1 | \n",
" 16.600000 | \n",
" E | \n",
" 19281001 | \n",
"
\n",
" \n",
" 1 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 11 | \n",
" 29.200001 | \n",
" 34.000000 | \n",
" 1 | \n",
" 29.700001 | \n",
" E | \n",
" 19281101 | \n",
"
\n",
" \n",
" 2 | \n",
" 01AD001 | \n",
" 1928 | \n",
" 12 | \n",
" 17.600000 | \n",
" 29.200001 | \n",
" 1 | \n",
" 29.200001 | \n",
" None | \n",
" 19281201 | \n",
"
\n",
" \n",
" 3 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 1 | \n",
" 16.100000 | \n",
" 17.400000 | \n",
" 1 | \n",
" 16.100000 | \n",
" B | \n",
" 19290101 | \n",
"
\n",
" \n",
" 4 | \n",
" 01AD001 | \n",
" 1929 | \n",
" 2 | \n",
" 13.500000 | \n",
" 16.299999 | \n",
" 1 | \n",
" 16.299999 | \n",
" B | \n",
" 19290201 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol \\\n",
"0 01AD001 1928 10 15.300000 29.700001 1 16.600000 E \n",
"1 01AD001 1928 11 29.200001 34.000000 1 29.700001 E \n",
"2 01AD001 1928 12 17.600000 29.200001 1 29.200001 None \n",
"3 01AD001 1929 1 16.100000 17.400000 1 16.100000 B \n",
"4 01AD001 1929 2 13.500000 16.299999 1 16.299999 B \n",
"\n",
" DateKey \n",
"0 19281001 \n",
"1 19281101 \n",
"2 19281201 \n",
"3 19290101 \n",
"4 19290201 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sqlite_connection.execute_query('SELECT * FROM main.Hydat_FlowData').head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-11T05:23:25.961118Z",
"start_time": "2018-03-11T05:23:25.778192Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"sqlite_connection.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {
"height": "837px",
"left": "0px",
"right": "1468px",
"top": "107px",
"width": "212px"
},
"toc_section_display": "block",
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "384px",
"left": "838px",
"right": "20px",
"top": "166px",
"width": "710px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}