Add bulk, count, clear scroll, close PIT examples (#3510)
lcawl authored Jan 15, 2025
1 parent 35f9011 commit 4eeb458
Showing 25 changed files with 687 additions and 271 deletions.
114 changes: 64 additions & 50 deletions output/openapi/elasticsearch-openapi.json

Large diffs are not rendered by default.

114 changes: 64 additions & 50 deletions output/openapi/elasticsearch-serverless-openapi.json

Large diffs are not rendered by default.

232 changes: 129 additions & 103 deletions output/schema/schema.json

Large diffs are not rendered by default.

143 changes: 130 additions & 13 deletions specification/_global/bulk/BulkRequest.ts
@@ -31,8 +31,115 @@ import { OperationContainer, UpdateAction } from './types'

/**
* Bulk index or delete documents.
* Performs multiple indexing or delete operations in a single API call.
* Perform multiple `index`, `create`, `delete`, and `update` actions in a single request.
* This reduces overhead and can greatly increase indexing speed.
*
* If the Elasticsearch security features are enabled, you must have the following index privileges for the target data stream, index, or index alias:
*
* * To use the `create` action, you must have the `create_doc`, `create`, `index`, or `write` index privilege. Data streams support only the `create` action.
* * To use the `index` action, you must have the `create`, `index`, or `write` index privilege.
* * To use the `delete` action, you must have the `delete` or `write` index privilege.
* * To use the `update` action, you must have the `index` or `write` index privilege.
* * To automatically create a data stream or index with a bulk API request, you must have the `auto_configure`, `create_index`, or `manage` index privilege.
* * To make the result of a bulk operation visible to search using the `refresh` parameter, you must have the `maintenance` or `manage` index privilege.
*
* Automatic data stream creation requires a matching index template with data stream enabled.
*
* The actions are specified in the request body using a newline delimited JSON (NDJSON) structure:
*
* ```
* action_and_meta_data\n
* optional_source\n
* action_and_meta_data\n
* optional_source\n
* ....
* action_and_meta_data\n
* optional_source\n
* ```
*
* The `index` and `create` actions expect a source on the next line and have the same semantics as the `op_type` parameter in the standard index API.
* A `create` action fails if a document with the same ID already exists in the target.
* An `index` action adds or replaces a document as necessary.
*
* NOTE: Data streams support only the `create` action.
* To update or delete a document in a data stream, you must target the backing index containing the document.
*
* An `update` action expects that the partial document, upsert, or script and its options are specified on the next line.
*
* A `delete` action does not expect a source on the next line and has the same semantics as the standard delete API.
*
* NOTE: The final line of data must end with a newline character (`\n`).
* Each newline character may be preceded by a carriage return (`\r`).
* When sending NDJSON data to the `_bulk` endpoint, use a `Content-Type` header of `application/json` or `application/x-ndjson`.
* Because this format uses literal newline characters (`\n`) as delimiters, make sure that the JSON actions and sources are not pretty printed.
*
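* For illustration only, here is a minimal TypeScript sketch of building and submitting a valid NDJSON body; it is not part of the specification, and the node URL, index names, and documents are placeholder assumptions:
*
* ```
* // Each action and optional source is one JSON object per line.
* const operations = [
*   { index: { _index: 'test', _id: '1' } },
*   { field1: 'value1' },
*   { delete: { _index: 'test', _id: '2' } },
* ]
*
* // JSON.stringify does not pretty print by default, so each object stays
* // on a single line. Terminate the body with the required trailing `\n`.
* const body = operations.map(op => JSON.stringify(op)).join('\n') + '\n'
*
* // Any HTTP client works; `fetch` is used here for brevity (run inside an
* // async context). The localhost URL is an assumption.
* const response = await fetch('http://localhost:9200/_bulk', {
*   method: 'POST',
*   headers: { 'Content-Type': 'application/x-ndjson' },
*   body,
* })
* ```
*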
* If you provide a target in the request path, it is used for any actions that don't explicitly specify an `_index` argument.
*
* A note on the format: the idea here is to make processing as fast as possible.
* As some of the actions are redirected to other shards on other nodes, only `action_and_meta_data` is parsed on the receiving node side.
*
* Client libraries using this protocol should strive to do something similar on the client side and reduce buffering as much as possible.
*
* There is no "correct" number of actions to perform in a single bulk request.
* Experiment with different settings to find the optimal size for your particular workload.
* Note that Elasticsearch limits the maximum size of an HTTP request to 100mb by default, so clients must ensure that no request exceeds this size.
* It is not possible to index a single document that exceeds the size limit, so you must pre-process any such documents into smaller pieces before sending them to Elasticsearch.
* For instance, split documents into pages or chapters before indexing them, or store raw binary data in a system outside Elasticsearch and replace the raw data with a link to the external system in the documents that you send to Elasticsearch.
*
* **Client support for bulk requests**
*
* Some of the officially supported clients provide helpers to assist with bulk requests and reindexing:
*
* * Go: Check out `esutil.BulkIndexer`
* * Perl: Check out `Search::Elasticsearch::Client::5_0::Bulk` and `Search::Elasticsearch::Client::5_0::Scroll`
* * Python: Check out `elasticsearch.helpers.*`
* JavaScript: Check out `client.helpers.*` (see the sketch after this list)
* * .NET: Check out `BulkAllObservable`
* * PHP: Check out bulk indexing.
*
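* For example, a minimal sketch with the JavaScript helper, assuming the `@elastic/elasticsearch` client; the node URL, index name, and documents are placeholders:
*
* ```
* import { Client } from '@elastic/elasticsearch'
*
* const client = new Client({ node: 'http://localhost:9200' })
*
* // The helper batches the datasource into bulk requests and retries
* // failed operations for you.
* const result = await client.helpers.bulk({
*   datasource: [{ title: 'doc 1' }, { title: 'doc 2' }],
*   onDocument (doc) {
*     // Return the action to perform for each document.
*     return { index: { _index: 'my-index' } }
*   }
* })
*
* console.log(result) // summary counts such as successful and failed operations
* ```
*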
* **Submitting bulk requests with cURL**
*
* If you're providing text file input to `curl`, you must use the `--data-binary` flag instead of plain `-d`.
* The latter doesn't preserve newlines. For example:
*
* ```
* $ cat requests
* { "index" : { "_index" : "test", "_id" : "1" } }
* { "field1" : "value1" }
* $ curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary "@requests"; echo
* {"took":7, "errors": false, "items":[{"index":{"_index":"test","_id":"1","_version":1,"result":"created","forced_refresh":false}}]}
* ```
*
* **Optimistic concurrency control**
*
* Each `index` and `delete` action within a bulk API call may include the `if_seq_no` and `if_primary_term` parameters in their respective action and meta data lines.
* The `if_seq_no` and `if_primary_term` parameters control how operations are run, based on the last modification to existing documents. See Optimistic concurrency control for more details.
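*
* For example, an action line of this form (the identifiers and values are illustrative) indexes the document only if its last modification was assigned sequence number 362 in primary term 2:
*
* ```
* { "index" : { "_index" : "test", "_id" : "1", "if_seq_no" : 362, "if_primary_term" : 2 } }
* { "field1" : "value1" }
* ```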
*
* **Versioning**
*
* Each bulk item can include the version value using the `version` field.
* It automatically follows the behavior of the index or delete operation based on the `_version` mapping.
* It also supports the `version_type` parameter.
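*
* For example, an item of this form (values are illustrative) applies an external version number:
*
* ```
* { "index" : { "_index" : "test", "_id" : "1", "version" : 5, "version_type" : "external" } }
* { "field1" : "value1" }
* ```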
*
* **Routing**
*
* Each bulk item can include the routing value using the `routing` field.
* It automatically follows the behavior of the index or delete operation based on the `_routing` mapping.
*
* NOTE: Data streams do not support custom routing unless they were created with the `allow_custom_routing` setting enabled in the template.
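*
* For example, an item of this form (values are illustrative) routes the document using a custom value:
*
* ```
* { "index" : { "_index" : "test", "_id" : "1", "routing" : "user1" } }
* { "field1" : "value1" }
* ```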
*
* **Wait for active shards**
*
* When making bulk calls, you can set the `wait_for_active_shards` parameter to require a minimum number of shard copies to be active before starting to process the bulk request.
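*
* For example, a request of this form (the value `2` is illustrative) does not start processing until at least two copies of each relevant shard are active:
*
* ```
* POST /_bulk?wait_for_active_shards=2
* ```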
*
* **Refresh**
*
* Control when the changes made by this request are visible to search.
*
* NOTE: Only the shards that receive the bulk request will be affected by refresh.
* Imagine a `_bulk?refresh=wait_for` request with three documents in it that happen to be routed to different shards in an index with five shards.
* The request will only wait for those three shards to refresh.
* The other two shards that make up the index do not participate in the `_bulk` request at all.
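*
* For example, a request of this form (the document is illustrative) returns only once a refresh has made the new document visible to search:
*
* ```
* POST /_bulk?refresh=wait_for
* { "index" : { "_index" : "test", "_id" : "1" } }
* { "field1" : "value1" }
* ```
*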
* @rest_spec_name bulk
* @availability stack stability=stable
* @availability serverless stability=stable visibility=public
@@ -53,62 +160,72 @@ export interface Request<TDocument, TPartialDocument> extends RequestBase {
]
path_parts: {
/**
* Name of the data stream, index, or index alias to perform bulk actions on.
* The name of the data stream, index, or index alias to perform bulk actions on.
*/
index?: IndexName
}
query_parameters: {
/**
* If `true`, the response will include the ingest pipelines that were executed for each index or create.
* If `true`, the response will include the ingest pipelines that were run for each index or create.
* @server_default false
*/
list_executed_pipelines?: boolean
/**
* ID of the pipeline to use to preprocess incoming documents.
* If the index has a default ingest pipeline specified, then setting the value to `_none` disables the default ingest pipeline for this request.
* If a final pipeline is configured it will always run, regardless of the value of this parameter.
* The pipeline identifier to use to preprocess incoming documents.
* If the index has a default ingest pipeline specified, setting the value to `_none` turns off the default ingest pipeline for this request.
* If a final pipeline is configured, it will always run regardless of the value of this parameter.
*/
pipeline?: string
/**
* If `true`, Elasticsearch refreshes the affected shards to make this operation visible to search, if `wait_for` then wait for a refresh to make this operation visible to search, if `false` do nothing with refreshes.
* If `true`, Elasticsearch refreshes the affected shards to make this operation visible to search.
* If `wait_for`, wait for a refresh to make this operation visible to search.
* If `false`, do nothing with refreshes.
* Valid values: `true`, `false`, `wait_for`.
* @server_default false
*/
refresh?: Refresh
/**
* Custom value used to route operations to a specific shard.
* A custom value that is used to route operations to a specific shard.
*/
routing?: Routing
/**
* `true` or `false` to return the `_source` field or not, or a list of fields to return.
* Indicates whether to return the `_source` field (`true` or `false`) or contains a list of fields to return.
*/
_source?: SourceConfigParam
/**
* A comma-separated list of source fields to exclude from the response.
* You can also use this parameter to exclude fields from the subset specified in `_source_includes` query parameter.
* If the `_source` parameter is `false`, this parameter is ignored.
*/
_source_excludes?: Fields
/**
* A comma-separated list of source fields to include in the response.
* If this parameter is specified, only these source fields are returned.
* You can exclude fields from this subset using the `_source_excludes` query parameter.
* If the `_source` parameter is `false`, this parameter is ignored.
*/
_source_includes?: Fields
/**
* Period each action waits for the following operations: automatic index creation, dynamic mapping updates, waiting for active shards.
* The period each action waits for the following operations: automatic index creation, dynamic mapping updates, and waiting for active shards.
* The default is `1m` (one minute), which guarantees Elasticsearch waits for at least the timeout before failing.
* The actual wait time could be longer, particularly when multiple waits occur.
* @server_default 1m
*/
timeout?: Duration
/**
* The number of shard copies that must be active before proceeding with the operation.
* Set to all or any positive integer up to the total number of shards in the index (`number_of_replicas+1`).
* Set to `all` or any positive integer up to the total number of shards in the index (`number_of_replicas+1`).
* The default is `1`, which waits for each primary shard to be active.
* @server_default 1
*/
wait_for_active_shards?: WaitForActiveShards
/**
* If `true`, the requests actions must target an index alias.
* If `true`, the request's actions must target an index alias.
* @server_default false
*/
require_alias?: boolean
/**
* If `true`, the request's actions must target a data stream (existing or to-be-created).
* If `true`, the request's actions must target a data stream (existing or to be created).
* @server_default false
*/
require_data_stream?: boolean
14 changes: 14 additions & 0 deletions specification/_global/bulk/BulkResponse.ts
@@ -22,9 +22,23 @@ import { long } from '@_types/Numeric'
import { OperationType, ResponseItem } from './types'

export class Response {
/**
* The response contains the individual results of each operation in the request.
* They are returned in the order submitted.
* The success or failure of an individual operation does not affect other operations in the request.
*/
body: {
/**
* If `true`, one or more of the operations in the bulk request did not complete successfully.
*/
errors: boolean
/**
* The result of each operation in the bulk request, in the order they were submitted.
*/
items: SingleKeyDictionary<OperationType, ResponseItem>[]
/**
* The length of time, in milliseconds, it took to process the bulk request.
*/
took: long
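/**
 * The length of time, in milliseconds, it took to process documents through an ingest pipeline.
 * It is only present when the request interacted with ingest pipelines.
 */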
ingest_took?: long
}
@@ -0,0 +1,17 @@
summary: Multiple operations
# method_request: POST _bulk
description: Run `POST _bulk` to perform multiple operations.
# type: request
value: '{ "index" : { "_index" : "test", "_id" : "1" } }
{ "field1" : "value1" }
{ "delete" : { "_index" : "test", "_id" : "2" } }
{ "create" : { "_index" : "test", "_id" : "3" } }
{ "field1" : "value3" }
{ "update" : {"_id" : "1", "_index" : "test"} }
{ "doc" : {"field2" : "value2"} }'
@@ -0,0 +1,26 @@
summary: Bulk updates
# method_request: POST _bulk
description: >
When you run `POST _bulk` and use the `update` action, you can use `retry_on_conflict` as a field in the action itself (not in the extra payload line) to specify how many times an update should be retried in the case of a version conflict.
# type: request
value: |-
  { "update" : {"_id" : "1", "_index" : "index1", "retry_on_conflict" : 3} }
  { "doc" : {"field" : "value"} }
  { "update" : { "_id" : "0", "_index" : "index1", "retry_on_conflict" : 3} }
  { "script" : { "source": "ctx._source.counter += params.param1", "lang" : "painless", "params" : {"param1" : 1}}, "upsert" : {"counter" : 1}}
  { "update" : {"_id" : "2", "_index" : "index1", "retry_on_conflict" : 3} }
  { "doc" : {"field" : "value"}, "doc_as_upsert" : true }
  { "update" : {"_id" : "3", "_index" : "index1", "_source" : true} }
  { "doc" : {"field" : "value"} }
  { "update" : {"_id" : "4", "_index" : "index1"} }
  { "doc" : {"field" : "value"}, "_source": true}
@@ -0,0 +1,16 @@
summary: Filter for failed operations
# method_request: POST /_bulk
description: >
To return only information about failed operations, run `POST /_bulk?filter_path=items.*.error`.
# type: request
value: '{ "update": {"_id": "5", "_index": "index1"} }
{ "doc": {"my_field": "foo"} }
{ "update": {"_id": "6", "_index": "index1"} }
{ "doc": {"my_field": "foo"} }
{ "create": {"_id": "7", "_index": "index1"} }
{ "my_field": "foo" }'
@@ -0,0 +1,13 @@
summary: Dynamic templates
method_request: POST /_bulk
description: >
Run `POST /_bulk` to perform a bulk request that consists of index and create actions with the `dynamic_templates` parameter.
The bulk request creates two new fields `work_location` and `home_location` with type `geo_point` according to the `dynamic_templates` parameter.
However, the `raw_location` field is created using default dynamic mapping rules; because it is supplied as a string in the JSON document, it is mapped as a `text` field.
# type: request
value: "{ \"index\" : {\
\ \"_index\" : \"my_index\", \"_id\" : \"1\", \"dynamic_templates\": {\"work_location\"\
: \"geo_point\"}} }\n{ \"field\" : \"value1\", \"work_location\": \"41.12,-71.34\"\
, \"raw_location\": \"41.12,-71.34\"}\n{ \"create\" : { \"_index\" : \"my_index\"\
, \"_id\" : \"2\", \"dynamic_templates\": {\"home_location\": \"geo_point\"}} }\n\
{ \"field\" : \"value2\", \"home_location\": \"41.12,-71.34\"}"
@@ -0,0 +1,27 @@
summary: Multiple successful operations
# description: ''
# type: response
# response_code: ''
value: |-
  {
    "took": 30,
    "errors": false,
    "items": [
      {
        "index": {
          "_index": "test",
          "_id": "1",
          "_version": 1,
          "result": "created",
          "_shards": { "total": 2, "successful": 1, "failed": 0 },
          "status": 201,
          "_seq_no": 0,
          "_primary_term": 1
        }
      },
      {
        "delete": {
          "_index": "test",
          "_id": "2",
          "_version": 1,
          "result": "not_found",
          "_shards": { "total": 2, "successful": 1, "failed": 0 },
          "status": 404,
          "_seq_no": 1,
          "_primary_term": 2
        }
      },
      {
        "create": {
          "_index": "test",
          "_id": "3",
          "_version": 1,
          "result": "created",
          "_shards": { "total": 2, "successful": 1, "failed": 0 },
          "status": 201,
          "_seq_no": 2,
          "_primary_term": 3
        }
      },
      {
        "update": {
          "_index": "test",
          "_id": "1",
          "_version": 2,
          "result": "updated",
          "_shards": { "total": 2, "successful": 1, "failed": 0 },
          "status": 200,
          "_seq_no": 3,
          "_primary_term": 4
        }
      }
    ]
  }
@@ -0,0 +1,24 @@
summary: Failed actions
description: >
If you run `POST /_bulk` with operations that update non-existent documents, the operations cannot complete successfully.
The API returns a response with an `errors` property value of `true`.
The response also includes an error object for any failed operations.
The error object contains additional information about the failure, such as the error type and reason.
# type: response
# response_code: ''
value: |-
  {
    "took": 486,
    "errors": true,
    "items": [
      {
        "update": {
          "_index": "index1",
          "_id": "5",
          "status": 404,
          "error": {
            "type": "document_missing_exception",
            "reason": "[5]: document missing",
            "index_uuid": "aAsFqTI0Tc2W0LCWgPNrOA",
            "shard": "0",
            "index": "index1"
          }
        }
      },
      {
        "update": {
          "_index": "index1",
          "_id": "6",
          "status": 404,
          "error": {
            "type": "document_missing_exception",
            "reason": "[6]: document missing",
            "index_uuid": "aAsFqTI0Tc2W0LCWgPNrOA",
            "shard": "0",
            "index": "index1"
          }
        }
      },
      {
        "create": {
          "_index": "index1",
          "_id": "7",
          "_version": 1,
          "result": "created",
          "_shards": { "total": 2, "successful": 1, "failed": 0 },
          "_seq_no": 0,
          "_primary_term": 1,
          "status": 201
        }
      }
    ]
  }
@@ -0,0 +1,14 @@
summary: Filter for failed operations
description: >
An example response from `POST /_bulk?filter_path=items.*.error`, which returns only information about failed operations.
# type: response
# response_code: ''
value: |-
  {
    "items": [
      {
        "update": {
          "error": {
            "type": "document_missing_exception",
            "reason": "[5]: document missing",
            "index_uuid": "aAsFqTI0Tc2W0LCWgPNrOA",
            "shard": "0",
            "index": "index1"
          }
        }
      },
      {
        "update": {
          "error": {
            "type": "document_missing_exception",
            "reason": "[6]: document missing",
            "index_uuid": "aAsFqTI0Tc2W0LCWgPNrOA",
            "shard": "0",
            "index": "index1"
          }
        }
      }
    ]
  }