{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# National Water Data: HYDAT\n", "\n", "The Hydat data is made available through the following Government of Canada website: http://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/. \n", "\n", "The daily flow data is extracted from a provided MS access database and after transformation the data is stored into a SQL server database." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:22:25.782648Z", "start_time": "2018-03-11T05:22:23.786453Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import warnings\n", "import PythonTools as PT\n", "import os\n", "import sys\n", "%matplotlib inline\n", "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:22:25.960324Z", "start_time": "2018-03-11T05:22:25.784649Z" } }, "outputs": [], "source": [ "import urllib.request\n", "\n", "sourcefile = urllib.request.urlretrieve(\"http://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/Hydat_sqlite3_20180117.zip\",\n", " \"Data/Hydat_sqlite3_20180117.zip\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:22:26.138959Z", "start_time": "2018-03-11T05:22:25.962324Z" }, "collapsed": true }, "outputs": [], "source": [ "PT.unzip_file(path_to_zip_file='Data/Hydat_sqlite3_20180117.zip', destination='Data')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:22:26.312087Z", "start_time": "2018-03-11T05:22:26.140963Z" }, "collapsed": true }, "outputs": [], "source": [ "hydat_path = \"Data/Hydat.sqlite3\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hydat data\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:22:26.486431Z", "start_time": "2018-03-11T05:22:26.314086Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sqliet version: 2.6.0\n", "Successful Connection\n" ] } ], "source": [ "hydat_connection = PT.Sqlite3Connect(hydat_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Input Query (data Extraction)\n", "\n", "A query to extract a portion of the data to perform required transformation followed by data analysis." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:12.380005Z", "start_time": "2018-03-11T05:22:26.488433Z" }, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHFLOW1FLOW_SYMBOL1FLOW2FLOW_SYMBOL2FLOW3FLOW_SYMBOL3FLOW4...FLOW28FLOW_SYMBOL28FLOW29FLOW_SYMBOL29FLOW30FLOW_SYMBOL30FLOW31FLOW_SYMBOL31MINMAX
001AD00119281016.600000E16.600000E16.600000E16.600000...29.700001E29.700001E29.700001E29.700001E15.30000029.700001
101AD00119281129.700001E29.700001E29.700001E29.700001...31.400000None31.400000None29.200001NoneNaNNone29.20000134.000000
201AD00119281229.200001None29.200001None29.200001None29.200001...17.600000None17.600000None17.600000None17.600000None17.60000029.200001
301AD0011929116.100000B16.100000B16.100000B16.100000...17.400000B17.400000B17.400000B17.400000B16.10000017.400000
401AD0011929216.299999B16.299999B16.299999B16.299999...13.500000BNaNNoneNaNNoneNaNNone13.50000016.299999
\n", "

5 rows × 67 columns

\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH FLOW1 FLOW_SYMBOL1 FLOW2 FLOW_SYMBOL2 \\\n", "0 01AD001 1928 10 16.600000 E 16.600000 E \n", "1 01AD001 1928 11 29.700001 E 29.700001 E \n", "2 01AD001 1928 12 29.200001 None 29.200001 None \n", "3 01AD001 1929 1 16.100000 B 16.100000 B \n", "4 01AD001 1929 2 16.299999 B 16.299999 B \n", "\n", " FLOW3 FLOW_SYMBOL3 FLOW4 ... FLOW28 FLOW_SYMBOL28 \\\n", "0 16.600000 E 16.600000 ... 29.700001 E \n", "1 29.700001 E 29.700001 ... 31.400000 None \n", "2 29.200001 None 29.200001 ... 17.600000 None \n", "3 16.100000 B 16.100000 ... 17.400000 B \n", "4 16.299999 B 16.299999 ... 13.500000 B \n", "\n", " FLOW29 FLOW_SYMBOL29 FLOW30 FLOW_SYMBOL30 FLOW31 \\\n", "0 29.700001 E 29.700001 E 29.700001 \n", "1 31.400000 None 29.200001 None NaN \n", "2 17.600000 None 17.600000 None 17.600000 \n", "3 17.400000 B 17.400000 B 17.400000 \n", "4 NaN None NaN None NaN \n", "\n", " FLOW_SYMBOL31 MIN MAX \n", "0 E 15.300000 29.700001 \n", "1 None 29.200001 34.000000 \n", "2 None 17.600000 29.200001 \n", "3 B 16.100000 17.400000 \n", "4 None 13.500000 16.299999 \n", "\n", "[5 rows x 67 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query='''\n", "SELECT\n", " F.STATION_NUMBER\n", " ,YEAR\n", " ,MONTH\n", " ,FLOW1\n", " ,FLOW_SYMBOL1\n", " ,FLOW2\n", " ,FLOW_SYMBOL2\n", " ,FLOW3\n", " ,FLOW_SYMBOL3\n", " ,FLOW4\n", " ,FLOW_SYMBOL4\n", " ,FLOW5\n", " ,FLOW_SYMBOL5\n", " ,FLOW6\n", " ,FLOW_SYMBOL6\n", " ,FLOW7\n", " ,FLOW_SYMBOL7\n", " ,FLOW8\n", " ,FLOW_SYMBOL8\n", " ,FLOW9\n", " ,FLOW_SYMBOL9\n", " ,FLOW10\n", " ,FLOW_SYMBOL10\n", " ,FLOW11\n", " ,FLOW_SYMBOL11\n", " ,FLOW12\n", " ,FLOW_SYMBOL12\n", " ,FLOW13\n", " ,FLOW_SYMBOL13\n", " ,FLOW14\n", " ,FLOW_SYMBOL14\n", " ,FLOW15\n", " ,FLOW_SYMBOL15\n", " ,FLOW16\n", " ,FLOW_SYMBOL16\n", " ,FLOW17\n", " ,FLOW_SYMBOL17\n", " ,FLOW18\n", " ,FLOW_SYMBOL18\n", " ,FLOW19\n", " ,FLOW_SYMBOL19\n", " ,FLOW20\n", " ,FLOW_SYMBOL20\n", " ,FLOW21\n", " ,FLOW_SYMBOL21\n", " ,FLOW22\n", " ,FLOW_SYMBOL22\n", " ,FLOW23\n", " ,FLOW_SYMBOL23\n", " ,FLOW24\n", " ,FLOW_SYMBOL24\n", " ,FLOW25\n", " ,FLOW_SYMBOL25\n", " ,FLOW26\n", " ,FLOW_SYMBOL26\n", " ,FLOW27\n", " ,FLOW_SYMBOL27\n", " ,FLOW28\n", " ,FLOW_SYMBOL28\n", " ,FLOW29\n", " ,FLOW_SYMBOL29\n", " ,FLOW30\n", " ,FLOW_SYMBOL30\n", " ,FLOW31\n", " ,FLOW_SYMBOL31\n", " ,MIN\n", " ,MAX\n", "FROM DLY_FLOWS F\n", "INNER JOIN STATIONS S\n", " ON F.STATION_NUMBER = S.STATION_NUMBER\n", "'''\n", "df_hydat = hydat_connection.execute_query(query)\n", "df_hydat.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Choose a subset of data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:12.659053Z", "start_time": "2018-03-11T05:23:12.388011Z" }, "collapsed": true }, "outputs": [], "source": [ "df_hydat = df_hydat.head(1000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Break table (Days, Symbols)\n", "\n", "Each day has a flow value along with a symbol. The transformation un-pivots the source data based on daily flow and symbol. This is implemented below and a preview of the results are provided." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:12.869161Z", "start_time": "2018-03-11T05:23:12.661055Z" }, "collapsed": true }, "outputs": [], "source": [ "basecolumn = ['STATION_NUMBER', 'YEAR', 'MONTH', 'MIN', 'MAX']\n", "column_flow_day = basecolumn+['FLOW%i'%(i) for i in range(1,32)]\n", "column_symbol_day= basecolumn+['FLOW_SYMBOL%i'%(i) for i in range(1,32)]\n", "\n", "df_hydat_flow = df_hydat[column_flow_day]\n", "df_hydat_symbol = df_hydat[column_symbol_day]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unpivot Days (melt)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:13.094857Z", "start_time": "2018-03-11T05:23:12.871164Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXDayFlowValue
001AD00119281015.30000029.700001FLOW116.600000
101AD00119281129.20000134.000000FLOW129.700001
201AD00119281217.60000029.200001FLOW129.200001
301AD0011929116.10000017.400000FLOW116.100000
401AD0011929213.50000016.299999FLOW116.299999
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue\n", "0 01AD001 1928 10 15.300000 29.700001 FLOW1 16.600000\n", "1 01AD001 1928 11 29.200001 34.000000 FLOW1 29.700001\n", "2 01AD001 1928 12 17.600000 29.200001 FLOW1 29.200001\n", "3 01AD001 1929 1 16.100000 17.400000 FLOW1 16.100000\n", "4 01AD001 1929 2 13.500000 16.299999 FLOW1 16.299999" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_hydat_flow_melt = pd.melt(df_hydat_flow,\n", " id_vars=['STATION_NUMBER', 'YEAR', 'MONTH', 'MIN', 'MAX'],\n", " var_name = 'Day',\n", " value_name=\"FlowValue\")\n", "df_hydat_flow_melt.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unpivot Symbols" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:13.332832Z", "start_time": "2018-03-11T05:23:13.097860Z" }, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXFlowSymbolSymbolDay
001AD00119281015.30000029.700001FLOW_SYMBOL1EFLOW1
101AD00119281129.20000134.000000FLOW_SYMBOL1EFLOW1
201AD00119281217.60000029.200001FLOW_SYMBOL1NoneFLOW1
301AD0011929116.10000017.400000FLOW_SYMBOL1BFLOW1
401AD0011929213.50000016.299999FLOW_SYMBOL1BFLOW1
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX FlowSymbol Symbol \\\n", "0 01AD001 1928 10 15.300000 29.700001 FLOW_SYMBOL1 E \n", "1 01AD001 1928 11 29.200001 34.000000 FLOW_SYMBOL1 E \n", "2 01AD001 1928 12 17.600000 29.200001 FLOW_SYMBOL1 None \n", "3 01AD001 1929 1 16.100000 17.400000 FLOW_SYMBOL1 B \n", "4 01AD001 1929 2 13.500000 16.299999 FLOW_SYMBOL1 B \n", "\n", " Day \n", "0 FLOW1 \n", "1 FLOW1 \n", "2 FLOW1 \n", "3 FLOW1 \n", "4 FLOW1 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_hydat_symbol_melt = pd.melt(df_hydat_symbol,\n", " id_vars=['STATION_NUMBER', 'YEAR', 'MONTH', 'MIN', 'MAX'],\n", " var_name = 'FlowSymbol',\n", " value_name=\"Symbol\")\n", "df_hydat_symbol_melt['Day'] = df_hydat_symbol_melt['FlowSymbol'].apply(lambda s: 'FLOW'+s.split('FLOW_SYMBOL')[1])\n", "df_hydat_symbol_melt.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Join tables\n", "\n", "The un-pivoted tables for daily flow and flow symbol are merged/joined together and the extra columns are excluded." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:13.578508Z", "start_time": "2018-03-11T05:23:13.335834Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXDayFlowValueMIN_MAX_FlowSymbolSymbol
001AD00119281015.30000029.700001FLOW116.60000015.30000029.700001FLOW_SYMBOL1E
101AD00119281129.20000134.000000FLOW129.70000129.20000134.000000FLOW_SYMBOL1E
201AD00119281217.60000029.200001FLOW129.20000117.60000029.200001FLOW_SYMBOL1None
301AD0011929116.10000017.400000FLOW116.10000016.10000017.400000FLOW_SYMBOL1B
401AD0011929213.50000016.299999FLOW116.29999913.50000016.299999FLOW_SYMBOL1B
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue \\\n", "0 01AD001 1928 10 15.300000 29.700001 FLOW1 16.600000 \n", "1 01AD001 1928 11 29.200001 34.000000 FLOW1 29.700001 \n", "2 01AD001 1928 12 17.600000 29.200001 FLOW1 29.200001 \n", "3 01AD001 1929 1 16.100000 17.400000 FLOW1 16.100000 \n", "4 01AD001 1929 2 13.500000 16.299999 FLOW1 16.299999 \n", "\n", " MIN_ MAX_ FlowSymbol Symbol \n", "0 15.300000 29.700001 FLOW_SYMBOL1 E \n", "1 29.200001 34.000000 FLOW_SYMBOL1 E \n", "2 17.600000 29.200001 FLOW_SYMBOL1 None \n", "3 16.100000 17.400000 FLOW_SYMBOL1 B \n", "4 13.500000 16.299999 FLOW_SYMBOL1 B " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hydat_initial_join = df_hydat_flow_melt.merge(df_hydat_symbol_melt,\n", " left_on=['STATION_NUMBER','YEAR', 'MONTH', 'Day'],\n", " right_on=['STATION_NUMBER','YEAR', 'MONTH', 'Day'],\n", " how='inner',suffixes=('', '_'))\n", "hydat_initial_join.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drop/rename columns" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:13.785657Z", "start_time": "2018-03-11T05:23:13.580511Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXDayFlowValueSymbol
001AD00119281015.30000029.700001FLOW116.600000E
101AD00119281129.20000134.000000FLOW129.700001E
201AD00119281217.60000029.200001FLOW129.200001None
301AD0011929116.10000017.400000FLOW116.100000B
401AD0011929213.50000016.299999FLOW116.299999B
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol\n", "0 01AD001 1928 10 15.300000 29.700001 FLOW1 16.600000 E\n", "1 01AD001 1928 11 29.200001 34.000000 FLOW1 29.700001 E\n", "2 01AD001 1928 12 17.600000 29.200001 FLOW1 29.200001 None\n", "3 01AD001 1929 1 16.100000 17.400000 FLOW1 16.100000 B\n", "4 01AD001 1929 2 13.500000 16.299999 FLOW1 16.299999 B" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hydat_initial_join.drop(['MIN_', 'MAX_','FlowSymbol'], axis=1, inplace=True)\n", "hydat_initial_join.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:14.018040Z", "start_time": "2018-03-11T05:23:13.787660Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXDayFlowValueSymbol
001AD00119281015.30000029.700001116.600000E
101AD00119281129.20000134.000000129.700001E
201AD00119281217.60000029.200001129.200001None
301AD0011929116.10000017.400000116.100000B
401AD0011929213.50000016.299999116.299999B
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol\n", "0 01AD001 1928 10 15.300000 29.700001 1 16.600000 E\n", "1 01AD001 1928 11 29.200001 34.000000 1 29.700001 E\n", "2 01AD001 1928 12 17.600000 29.200001 1 29.200001 None\n", "3 01AD001 1929 1 16.100000 17.400000 1 16.100000 B\n", "4 01AD001 1929 2 13.500000 16.299999 1 16.299999 B" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hydat_initial_join['Day'] = hydat_initial_join['Day'].apply( lambda d: int( d.split('FLOW')[1] ) )\n", "hydat_initial_join.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding date\n", "\n", "Year, month and day columns are combined to provide a date column that can be used for data analysis." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:15.910931Z", "start_time": "2018-03-11T05:23:14.020043Z" }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXDayFlowValueSymbolDateKey
001AD00119281015.30000029.700001116.600000E19281001
101AD00119281129.20000134.000000129.700001E19281101
201AD00119281217.60000029.200001129.200001None19281201
301AD0011929116.10000017.400000116.100000B19290101
401AD0011929213.50000016.299999116.299999B19290201
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol \\\n", "0 01AD001 1928 10 15.300000 29.700001 1 16.600000 E \n", "1 01AD001 1928 11 29.200001 34.000000 1 29.700001 E \n", "2 01AD001 1928 12 17.600000 29.200001 1 29.200001 None \n", "3 01AD001 1929 1 16.100000 17.400000 1 16.100000 B \n", "4 01AD001 1929 2 13.500000 16.299999 1 16.299999 B \n", "\n", " DateKey \n", "0 19281001 \n", "1 19281101 \n", "2 19281201 \n", "3 19290101 \n", "4 19290201 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hydat_initial_join['DateKey'] = hydat_initial_join[['YEAR', 'MONTH', 'Day']].apply(lambda x : \"{}{}{}\".format(x['YEAR'], str(x['MONTH']).zfill(2), str(x['Day']).zfill(2)), axis=1)\n", "hydat_initial_join.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Export to CSV\n", "\n", "The final result of the transformations for hydat data is written to a csv file." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:16.472148Z", "start_time": "2018-03-11T05:23:15.913933Z" }, "collapsed": true }, "outputs": [], "source": [ "PT.write_csv(hydat_initial_join,flname='Hydat.csv', chunksize=10000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Write to SQL server" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Initialize a connection" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:16.761733Z", "start_time": "2018-03-11T05:23:16.472148Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successful Connection\n" ] } ], "source": [ "connection = PT.ServerConnect(server='DESKTOP-AN4AQCT\\SQLEXPRESS', database='Dev')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Get server version" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:16.980893Z", "start_time": "2018-03-11T05:23:16.763735Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Microsoft SQL Server 2016 (SP1-GDR) (KB4019089) - 13.0.4206.0 (X64) \n", "\tJul 6 2017 07:55:03 \n", "\tCopyright (c) Microsoft Corporation\n", "\tExpress Edition (64-bit) on Windows 10 Home 6.3 (Build 16299: )\n", "\n" ] } ], "source": [ "print(connection.get_server_version())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create a new table" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:17.203065Z", "start_time": "2018-03-11T05:23:16.982895Z" }, "collapsed": true }, "outputs": [], "source": [ "cmd = '''\n", "DROP TABLE IF EXISTS [{schema}].[{TableName}]\n", "CREATE TABLE [{schema}].[{TableName}]\n", "(\n", " [RowId] INT IDENTITY (1, 1) NOT NULL,\n", " [STATION_NUMBER] NVARCHAR(10) Null,\n", " [YEAR] INT Null,\n", " [MONTH] INT Null,\n", " [MIN] DECIMAL(19,5) Null,\n", " [MAX] DECIMAL(19,5) Null,\n", " [Day] INT Null,\n", " [FlowValue] DECIMAL(19,5) Null,\n", " [Symbol] NVARCHAR(5) Null,\n", " [DateKey] INT Null,\n", ")\n", "\n", "CREATE CLUSTERED COLUMNSTORE INDEX IX_Hydat_DailyFlow ON [{schema}].[{TableName}]\n", "\n", "\n", "'''.format(schema = 'dbo', TableName='Hydat_FlowData')\n", "connection.deploy(cmd)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Write to SQL Server" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:22.778804Z", "start_time": "2018-03-11T05:23:17.205065Z" } }, "outputs": [], "source": [ "PT.write_to_sql(hydat_initial_join,table_name='Hydat_FlowData',if_exists='append', schema='dbo',\n", " connection_string=connection.connection_string)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Read Back From Server" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:23.483822Z", "start_time": "2018-03-11T05:23:22.778804Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RowIdSTATION_NUMBERYEARMONTHMINMAXDayFlowValueSymbolDateKey
0101AD00119281015.329.7116.6E19281001
1201AD00119281129.234.0129.7E19281101
2301AD00119281217.629.2129.2None19281201
3401AD0011929116.117.4116.1B19290101
4501AD0011929213.516.3116.3B19290201
\n", "
" ], "text/plain": [ " RowId STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol \\\n", "0 1 01AD001 1928 10 15.3 29.7 1 16.6 E \n", "1 2 01AD001 1928 11 29.2 34.0 1 29.7 E \n", "2 3 01AD001 1928 12 17.6 29.2 1 29.2 None \n", "3 4 01AD001 1929 1 16.1 17.4 1 16.1 B \n", "4 5 01AD001 1929 2 13.5 16.3 1 16.3 B \n", "\n", " DateKey \n", "0 19281001 \n", "1 19281101 \n", "2 19281201 \n", "3 19290101 \n", "4 19290201 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query ='''\n", "SELECT *\n", "FROM [{schema}].[{TableName}]\n", "'''.format(schema = 'dbo', TableName='Hydat_FlowData')\n", "DF_Hydat = connection.execute_query(query)\n", "DF_Hydat.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:23.663449Z", "start_time": "2018-03-11T05:23:23.485826Z" }, "collapsed": true }, "outputs": [], "source": [ "connection.close()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Write to sqlite database" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:23.959769Z", "start_time": "2018-03-11T05:23:23.665451Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sqliet version: 2.6.0\n", "Successful Connection\n" ] } ], "source": [ "sqlite_connection = PT.Sqlite3Connect('Data/Hydat_Transformed.db')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:24.140916Z", "start_time": "2018-03-11T05:23:23.961770Z" } }, "outputs": [], "source": [ "cmd1 = '''DROP TABLE IF EXISTS {schema}.{TableName};'''.format(schema='main', TableName='Hydat_FlowData')\n", "cmd2 = '''\n", "CREATE TABLE \n", "IF NOT EXISTS {schema}.{TableName}(\n", " [STATION_NUMBER] NVARCHAR(10) Null,\n", " [YEAR] INT Null,\n", " [MONTH] INT Null,\n", " [MIN] DECIMAL(19,5) Null,\n", " [MAX] DECIMAL(19,5) Null,\n", " [Day] INT Null,\n", " [FlowValue] DECIMAL(19,5) Null,\n", " [Symbol] NVARCHAR(5) Null,\n", " [DateKey] INT Null\n", ");\n", "'''.format(schema='main', TableName='Hydat_FlowData')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:24.575669Z", "start_time": "2018-03-11T05:23:24.142920Z" } }, "outputs": [], "source": [ "sqlite_connection.deploy(cmd1)\n", "sqlite_connection.deploy(cmd2)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:25.455274Z", "start_time": "2018-03-11T05:23:24.580672Z" } }, "outputs": [], "source": [ "PT.write_to_sql(hydat_initial_join,\n", " table_name='Hydat_FlowData',\n", " if_exists='append',\n", " schema='main',\n", " driver='sqlite',\n", " index=False,\n", " connection_string=sqlite_connection.database)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:25.776191Z", "start_time": "2018-03-11T05:23:25.455274Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATION_NUMBERYEARMONTHMINMAXDayFlowValueSymbolDateKey
001AD00119281015.30000029.700001116.600000E19281001
101AD00119281129.20000134.000000129.700001E19281101
201AD00119281217.60000029.200001129.200001None19281201
301AD0011929116.10000017.400000116.100000B19290101
401AD0011929213.50000016.299999116.299999B19290201
\n", "
" ], "text/plain": [ " STATION_NUMBER YEAR MONTH MIN MAX Day FlowValue Symbol \\\n", "0 01AD001 1928 10 15.300000 29.700001 1 16.600000 E \n", "1 01AD001 1928 11 29.200001 34.000000 1 29.700001 E \n", "2 01AD001 1928 12 17.600000 29.200001 1 29.200001 None \n", "3 01AD001 1929 1 16.100000 17.400000 1 16.100000 B \n", "4 01AD001 1929 2 13.500000 16.299999 1 16.299999 B \n", "\n", " DateKey \n", "0 19281001 \n", "1 19281101 \n", "2 19281201 \n", "3 19290101 \n", "4 19290201 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sqlite_connection.execute_query('SELECT * FROM main.Hydat_FlowData').head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2018-03-11T05:23:25.961118Z", "start_time": "2018-03-11T05:23:25.778192Z" }, "collapsed": true }, "outputs": [], "source": [ "sqlite_connection.close()" ] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" }, "toc": { "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "toc_cell": false, "toc_position": { "height": "837px", "left": "0px", "right": "1468px", "top": "107px", "width": "212px" }, "toc_section_display": "block", "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "position": { "height": "384px", "left": "838px", "right": "20px", "top": "166px", "width": "710px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }