[GC-118] Blueprint to load NER data into elasticsearch.
tylersbray committed Oct 29, 2020
1 parent 2ee9a21 commit c80fc7b
Showing 4 changed files with 239 additions and 0 deletions.
14 changes: 14 additions & 0 deletions gretel/gc-load_ner_to_elasticsearch/README.md
@@ -0,0 +1,14 @@
## Load Gretel NER data into Elasticsearch

Elasticsearch and Kibana provide an industry-standard platform for getting a quick start on exploring your data,
building dashboards, and running complex queries to search for records of interest. This blueprint
looks at using these tools to examine records enriched with Gretel NER labels.

In this blueprint we show how to use the Gretel Python client to bootstrap a project
with sample data, load the NER results into Elasticsearch, and inspect the results
via programmatic queries.

For this blueprint notebook you will need your Gretel API key. You can find it in the Gretel Console
under the Profile menu or at https://console.gretel.cloud/users/me/key. You will also need
Elasticsearch and Kibana running on localhost; the notebook sets up a local cluster with Docker Compose.

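The notebook's key transformation keeps only the fields worth indexing: the original record, its ingest time, and the NER labels bucketed by confidence. As a standalone sketch of that shape (the sample record below is invented for illustration; the field names mirror the blueprint's `trim_record` helper):

```python
from copy import deepcopy

INDEX_NAME = "gretel_ner_blueprint"

def trim_record(record):
    """Trim an NER-enriched record down to the fields we index,
    tagging it with the target Elasticsearch index."""
    return {
        "record": deepcopy(record["record"]),          # original record
        "ingest_time": record["ingest_time"],          # for time-series views
        "score_high": record["metadata"]["entities"]["score_high"],
        "score_med": record["metadata"]["entities"]["score_med"],
        "score_low": record["metadata"]["entities"]["score_low"],
        "_index": INDEX_NAME,                          # routes the bulk action
    }

# Hypothetical NER-enriched record, shaped like the Gretel API output
sample = {
    "record": {"City": "London", "NumberCarsOwned": 3},
    "ingest_time": "2020-10-29T00:00:00Z",
    "metadata": {
        "entities": {"score_high": ["location"], "score_med": [], "score_low": []}
    },
}

doc = trim_record(sample)
print(doc["_index"], doc["score_high"])
```

Because each trimmed document carries its own `_index` field, the list can be handed directly to the Elasticsearch `bulk` helper without a separate index argument.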
177 changes: 177 additions & 0 deletions gretel/gc-load_ner_to_elasticsearch/blueprint.ipynb
@@ -0,0 +1,177 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load NER data into Elasticsearch\n",
"\n",
"The Gretel Console (https://console.gretel.cloud) provides several views and tools to help explore your data. For those who cannot or do not wish to use the Console, another option is to load the data into Elasticsearch for exploration and reporting. This blueprint uses Gretel sample data to bootstrap a new project and shows a simple workflow for loading that data into an Elasticsearch cluster. We run a sample query to verify that records were loaded and show how to look for an NER label in your records."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Start the local Elasticsearch and Kibana containers defined in docker-compose.yaml\n",
"!docker-compose up -d"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"\n",
"import sys\n",
"\n",
"# Elasticsearch client version should match cluster version. See https://elasticsearch-py.readthedocs.io/en/master/#compatibility\n",
"!{sys.executable} -m pip install -U gretel-client elasticsearch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Be sure to use your Gretel API key here, which is available from the Profile menu in the Console\n",
"\n",
"import getpass\n",
"import os\n",
"\n",
"gretel_api_key = os.getenv(\"GRETEL_API_KEY\") or getpass.getpass(\"Your Gretel API Key\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install Gretel SDKs and bootstrap the project\n",
"\n",
"from gretel_client import project_from_uri, get_cloud_client\n",
"\n",
"client = get_cloud_client(\"api\", gretel_api_key)\n",
"client.install_packages()\n",
"project = client.get_project(create=True)\n",
"\n",
"project.send_bulk(client.get_sample('bike-customer-orders'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from copy import deepcopy\n",
"\n",
"index_name = \"gretel_ner_blueprint\"\n",
"\n",
"def trim_record(record):\n",
" trim = {}\n",
" # Keep the original record\n",
" trim['record'] = deepcopy(record['record'])\n",
" # Keep ingest_time for time series\n",
" trim['ingest_time'] = record['ingest_time']\n",
" # Keep just score_* lists for simplicity\n",
" trim['score_high'] = record['metadata']['entities']['score_high']\n",
" trim['score_med'] = record['metadata']['entities']['score_med']\n",
" trim['score_low'] = record['metadata']['entities']['score_low']\n",
" # Specify the Elasticsearch index for the record.\n",
" trim['_index'] = index_name\n",
" return trim\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from elasticsearch import Elasticsearch\n",
"from elasticsearch.helpers import bulk\n",
"\n",
"elasticsearch_host = 'localhost'\n",
"elasticsearch_port = 9200\n",
"es = Elasticsearch(\n",
" hosts=[{'host': elasticsearch_host, 'port': elasticsearch_port}]\n",
")\n",
"\n",
"bulk(\n",
" es, \n",
" project.iter_records(\n",
" # NOTE: the default direction is \"forward\", which blocks and waits for\n",
" # new records; \"backward\" reads the existing records and then returns.\n",
" direction=\"backward\",\n",
" post_process=trim_record, \n",
" params={\"flatten\": \"no\"}))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Find records tagged with the NER 'location' label that are also in London\n",
"# with 3 or more cars. Include a terms aggregation over TotalChildren.\n",
"aggregation_query = {\n",
" \"query\": {\n",
" \"bool\": {\n",
" \"must\": [\n",
" {\"match\": {\"record.City\": \"London\"}}, \n",
" {\"range\": {\"record.NumberCarsOwned\": {\"gte\": 3}}},\n",
" {\"match\": {\"score_high\": \"location\"}}\n",
" ]\n",
" }\n",
" },\n",
" \"size\": 3,\n",
" \"aggs\" : {\n",
" \"children\": {\n",
" \"terms\": {\"field\": \"record.TotalChildren\"}\n",
" }\n",
" }\n",
"}\n",
"\n",
"es.search(index=index_name, body=aggregation_query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean up\n",
"!docker-compose down\n",
"project.delete()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
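The search cell above passes a plain dict as the query body; when iterating on filters it can help to build that body programmatically instead of editing literal JSON. A minimal sketch (the `build_ner_query` helper and its parameter names are hypothetical, not part of the Gretel or Elasticsearch client APIs):

```python
def build_ner_query(city, min_cars, label, size=3,
                    children_field="record.TotalChildren"):
    """Build the bool-query + terms-aggregation body used in the notebook."""
    return {
        "query": {
            "bool": {
                "must": [
                    {"match": {"record.City": city}},
                    {"range": {"record.NumberCarsOwned": {"gte": min_cars}}},
                    {"match": {"score_high": label}},  # NER label bucket
                ]
            }
        },
        "size": size,
        "aggs": {"children": {"terms": {"field": children_field}}},
    }

# Same query as the notebook cell: London, 3+ cars, 'location' label
body = build_ner_query("London", 3, "location")
print(len(body["query"]["bool"]["must"]))
```

The resulting dict can be passed straight to `es.search(index=index_name, body=body)` as in the notebook.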
39 changes: 39 additions & 0 deletions gretel/gc-load_ner_to_elasticsearch/docker-compose.yaml
@@ -0,0 +1,39 @@
version: '3.7'

services:

# Elasticsearch Docker Images: https://www.docker.elastic.co/
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:7.9.2
container_name: elasticsearch
environment:
- xpack.security.enabled=false
- discovery.type=single-node
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
cap_add:
- IPC_LOCK
volumes:
- elasticsearch-data:/usr/share/elasticsearch/data
ports:
- 9200:9200
- 9300:9300

kibana:
container_name: kibana
image: docker.elastic.co/kibana/kibana:7.9.2
environment:
- ELASTICSEARCH_HOSTS=http://elasticsearch:9200
ports:
- 5601:5601
depends_on:
- elasticsearch

volumes:
elasticsearch-data:
driver: local
9 changes: 9 additions & 0 deletions gretel/gc-load_ner_to_elasticsearch/manifest.json
@@ -0,0 +1,9 @@
{
"name": "Load NER results to Elasticsearch",
"description": "Understand the NER record format and learn how to load records into an Elasticsearch cluster.",
"tags": ["python"],
"sample_data_key": "bike-customer-orders",
"language": "python",
"featured": false,
"blog_url": null
}
