{ "cells": [ { "cell_type": "markdown", "metadata": { "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 0, "height": 4, "hidden": false, "row": 0, "width": 4 }, "report_default": {} } } }, "hideCode": true, "hidePrompt": true }, "source": [ "# Import data from the new DIMM into ElasticSearch" ] }, { "cell_type": "markdown", "metadata": { "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 4, "height": 4, "hidden": false, "row": 0, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "source": [ "We load the modules and other variables. The data_slodar.txt has to be cleaned up since it contains towards the end some text which cannot be parsed." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-05-26T16:36:47.605236", "start_time": "2017-05-26T16:36:47.595025" }, "collapsed": true, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "hidden": true }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "from elasticsearch import Elasticsearch\n", "import pandas as pd\n", "import os\n", "import numpy as np\n", "import subprocess\n", "path_data = '/data/datalake/asm'\n", "new_dimm_filename = 'data_new_dimm.csv'" ] }, { "cell_type": "markdown", "metadata": { "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 8, "height": 4, "hidden": false, "row": 0, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "source": [ "We init the elastic search. Instead of servername, just insert the name of your server." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-05-26T16:36:48.584409", "start_time": "2017-05-26T16:36:48.578669" }, "collapsed": true, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "hidden": true }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "es = Elasticsearch('http://servername:9200', timeout=20.0, bulk_size=100000)" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2017-05-26T16:36:41.248938", "start_time": "2017-05-26T16:36:41.238927" } }, "source": [ "We perform the query" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-05-26T16:39:38.286125", "start_time": "2017-05-26T16:39:34.718856" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wget -O /data/datalake/asm/data_new_dimm.txt http://archive.eso.org/wdb/wdb/asm/dimm_paranal/query?wdbo=csv&start_date=2017-04-28T00:00:00.00..2017-05-01T12:00:00.00&tab_fwhm=1&tab_rfl=0&tab_rfl_time=0&top=1000000\n", "--2017-05-26 16:39:34-- http://archive.eso.org/wdb/wdb/asm/dimm_paranal/query?wdbo=csv&start_date=2017-04-28T00:00:00.00..2017-05-01T12:00:00.00&tab_fwhm=1&tab_rfl=0&tab_rfl_time=0&top=1000000\n", "Resolving archive.eso.org (archive.eso.org)... 134.171.46.246\n", "Connecting to archive.eso.org (archive.eso.org)|134.171.46.246|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: unspecified [text/plain]\n", "Saving to: ‘/data/datalake/asm/data_new_dimm.txt’\n", "\n", " 0K .......... .......... .......... .......... .......... 19.0K\n", " 50K 42.4K=2.6s\n", "\n", "2017-05-26 16:39:38 (19.1 KB/s) - ‘/data/datalake/asm/data_new_dimm.txt’ saved [51693]\n", "\n", "\n" ] } ], "source": [ "start_date_asm_str='2017-04-28T00:00:00.00'\n", "end_date_asm_str ='2017-05-01T12:00:00.00'\n", "request_asm_str = ['wget','-O',os.path.join(path_data,new_dimm_filename),\\\n", " 'http://archive.eso.org/wdb/wdb/asm/dimm_paranal/query?wdbo=csv&start_date={0:s}..{1:s}&tab_fwhm=1&tab_rfl=0&tab_rfl_time=0&top=1000000'.format(\\\n", " start_date_asm_str,end_date_asm_str)]\n", "output,error = subprocess.Popen(request_asm_str,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()\n", "print(' '.join(request_asm_str))\n", "print(output.decode('UTF8'))" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": false }, "source": [ "We read the csv files as a panda array." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-05-26T16:39:39.307410", "start_time": "2017-05-26T16:39:39.283089" }, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "hidden": true }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1984\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.\n", " if __name__ == '__main__':\n" ] } ], "source": [ "new_dimm_df = pd.read_csv(os.path.join(path_data,new_dimm_filename),skiprows=1,skipfooter=5)\n", "print(len(new_dimm_df))" ] }, { "cell_type": "markdown", "metadata": { "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 0, "height": 4, "hidden": false, "row": 4, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "source": [ "Let's see how it looks like" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2017-05-26T16:39:41.645258", "start_time": "2017-05-26T16:39:41.626649" }, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 4, "height": 13, "hidden": false, "row": 4, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Date timeDIMM Seeing [\"]
02017-04-28T00:01:010.401
12017-04-28T00:02:200.412
22017-04-28T00:03:400.457
32017-04-28T00:04:590.496
42017-04-28T00:06:180.493
\n", "
" ], "text/plain": [ " Date time DIMM Seeing [\"]\n", "0 2017-04-28T00:01:01 0.401\n", "1 2017-04-28T00:02:20 0.412\n", "2 2017-04-28T00:03:40 0.457\n", "3 2017-04-28T00:04:59 0.496\n", "4 2017-04-28T00:06:18 0.493" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_dimm_df.head()" ] }, { "cell_type": "markdown", "metadata": { "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 0, "height": 4, "hidden": false, "row": 12, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "source": [ "We change the key names and create a dictionnary based on the data frame." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2017-05-25T20:52:16.382709", "start_time": "2017-05-25T20:52:14.147532" }, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 8, "height": 4, "hidden": false, "row": 12, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "128166\n", "{'@timestamp': '2016-04-05T23:50:34', 'mass_GLfrac': 0.47100000000000003, 'mass_dimm_tau0': 0.002679, 'mass_tau0_RMS': 0.023, 'mass_tau0': 0.003039}\n" ] } ], "source": [ "new_dimm_df.rename(columns={'Date time': '@timestamp',\\\n", " 'DIMM Seeing [\"]':'dimm_seeing'}, inplace=True)\n", "new_dimm_dict =new_dimm_df.to_dict(orient='records') # this is a list of dict\n", "print(len(new_dimm_dict))\n", "print(new_dimm_dict[0])" ] }, { "cell_type": "markdown", "metadata": { "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 8, "height": 4, "hidden": false, "row": 8, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "source": [ "We also need now to filter all the NaN values" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2017-05-25T20:52:25.079406", "start_time": "2017-05-25T20:52:25.054125" }, "collapsed": true, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 8, "height": 4, "hidden": false, "row": 4, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "def clean_dico(dico,inline=True):\n", " \"\"\"\n", " Function that cleans a dictionary from nan vales by removing the entire key in case it encounters a nan value\n", " Input:\n", " - dico: the dictionary to clean\n", " - inline: boolean to specify whether a copy of the dictionary is to be returned or if the cleaning is done inline.\n", " \"\"\"\n", " keys_to_remove = []\n", " for key in dico.keys(): \n", " try:\n", " if np.any(np.isnan(dico[key])):\n", " keys_to_remove.append(key)\n", " except TypeError:\n", " continue\n", " if inline:\n", " new_dico = dico\n", " else:\n", " new_dico = dico.copy()\n", " for key in keys_to_remove:\n", " new_dico.pop(key)\n", " return new_dico\n", "\n", "def clean_dico_list(dico_list,inline=True):\n", " \"\"\"\n", " Function that cleans a list of dictionaries by calling the clean_dico method.\n", " Input:\n", " - dico_list: the list of dictionaries to clean\n", " - inline: boolean to specify whether a copy of the list is to be returned or if the cleaning is done inline. \n", " \"\"\"\n", " cleaned_dico_list = []\n", " for dico in dico_list:\n", " cleaned_dico_list.append(clean_dico(dico,inline=inline))\n", " return cleaned_dico_list" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2017-05-25T20:52:34.269741", "start_time": "2017-05-25T20:52:25.922862" }, "collapsed": true, "extensions": { "jupyter_dashboards": { "version": 1, "views": { "grid_default": { "col": 0, "height": 4, "hidden": false, "row": 8, "width": 4 }, "report_default": {} } } }, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "new_dimm_dict = clean_dico_list(new_dimm_dict)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We insert the entries in the elastic search data base" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2017-05-25T20:52:42.391245", "start_time": "2017-05-25T20:52:37.495470" }, "hideCode": false, "hidePrompt": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Inserting 0th document to the asm slodar elastic search...\n", "Inserting 300th document to the asm slodar elastic search...\n", "Inserting 600th document to the asm slodar elastic search...\n", "Inserting 900th document to the asm slodar elastic search...\n" ] } ], "source": [ "for i,doc in enumerate(new_dimm_dict):\n", " try:\n", " res = es.index(index='asm', doc_type='dimm', body= doc)\n", " except Exception as e:\n", " print('Error with document number {0:d}'.format(i))\n", " print('Th error occured while trying to insert the following dictionary:')\n", " print(doc)\n", " print(e)\n", " if np.mod(i,300)==0:\n", " print('Inserting {0:d}th document to the asm new dimm elastic search...'.format(i))" ] } ], "metadata": { "anaconda-cloud": {}, "extensions": { "jupyter_dashboards": { "activeView": "grid_default", "version": 1, "views": { "grid_default": { "cellMargin": 10, "defaultCellHeight": 20, "maxColumns": 12, "name": "grid", "type": "grid" }, "report_default": { "name": "report", "type": "report" } } } }, "hide_code_all_hidden": false, "hide_input": false, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "colors": { "hover_highlight": "#DAA520", "running_highlight": "#FF0000", "selected_highlight": "#FFD700" }, "moveMenuLeft": true, "nav_menu": { "height": "45px", "width": "252px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 2 }