-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #19 from bytewax/bytewax-v0.16
Bytewax to 0.16.1 and allowing disable api
- Loading branch information
Showing
17 changed files
with
194 additions
and
171 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,29 @@ | ||
from bytewax.dataflow import Dataflow | ||
from bytewax.execution import cluster_main | ||
from bytewax.inputs import ManualInputConfig | ||
from bytewax.outputs import ManualOutputConfig | ||
from bytewax import parse | ||
from bytewax.connectors.stdio import StdOutput | ||
from bytewax.inputs import StatelessSource, DynamicInput | ||
import time | ||
|
||
def input_builder(worker_index, worker_count, resume_epoch): | ||
# Ignore state recovery here | ||
state = None | ||
for i in range(100): | ||
class NumberSource(StatelessSource): | ||
def __init__(self, max, worker_index): | ||
self.worker_index = worker_index | ||
self.iterator = iter(range(max)) | ||
|
||
def next(self): | ||
time.sleep(1) | ||
yield state, i | ||
return f"Worker: {self.worker_index} - {next(self.iterator)}" | ||
|
||
def output_builder(worker_index, worker_count): | ||
def output_handler(item): | ||
print(f"worker: {worker_index} - item: {item}") | ||
return output_handler | ||
def close(self): | ||
pass | ||
|
||
flow = Dataflow() | ||
flow.input("inp", ManualInputConfig(input_builder)) | ||
flow.capture(ManualOutputConfig(output_builder)) | ||
|
||
if __name__ == "__main__": | ||
cluster_main(flow, **parse.proc_env()) | ||
class NumberInput(DynamicInput): | ||
def __init__(self, max): | ||
self.max = max | ||
|
||
def build(self, worker_index, worker_count): | ||
return NumberSource(self.max, worker_index) | ||
|
||
|
||
flow = Dataflow() | ||
flow.input("inp", NumberInput(100)) | ||
flow.output("out", StdOutput()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,94 +1,47 @@ | ||
import os | ||
from pathlib import Path | ||
|
||
from bytewax import parse | ||
from bytewax.dataflow import Dataflow | ||
from bytewax.execution import cluster_main | ||
from bytewax.inputs import distribute, ManualInputConfig | ||
from bytewax.outputs import ManualOutputConfig | ||
from bytewax.recovery import SqliteRecoveryConfig | ||
from bytewax.connectors.stdio import StdOutput | ||
from bytewax.connectors.files import DirInput, DirOutput, FileInput, FileOutput | ||
|
||
input_dir = Path("./examples/sample_data/cluster/") | ||
input_dir = Path("./sample_data/cluster/") | ||
output_dir = Path("./cluster_out/") | ||
recovery_dir = Path("./cluster_recovery/") | ||
|
||
# to see more on recovery with this example, see "examples/manual_cluster.py" | ||
recovery_dir.mkdir(exist_ok=True) | ||
recovery_config = SqliteRecoveryConfig(recovery_dir) | ||
|
||
|
||
def input_builder(worker_index, worker_count, resume_state): | ||
print(f"Worker {worker_index} resuming with state: {resume_state}") | ||
# Fill in a default resume state if we have None. The resume state | ||
# will be a dict from path to line number to start reading at. | ||
state = resume_state or {} | ||
# List all the input partitions in the reading directory. | ||
all_partitions = input_dir.glob("*.txt") | ||
# Then have this worker only read every `n` files so each worker | ||
# will read a disjoint set. | ||
this_worker_partitions = distribute(all_partitions, worker_index, worker_count) | ||
# Open all the ones that this worker should read. | ||
for path in this_worker_partitions: | ||
with open(path) as f: | ||
for i, line in enumerate(f): | ||
# If we're resuming, skip ahead to the line for this | ||
# file in the state. | ||
if i < state.get(path, 0): | ||
continue | ||
# Since the file has just read the current line as | ||
# part of the for loop, note that on resume we should | ||
# start reading from the next line. | ||
state[path] = i + 1 | ||
# Now send them into the dataflow on this worker. | ||
yield state, line.strip() | ||
print(f"Worker {worker_index} input state: {state}") | ||
|
||
|
||
def output_builder(worker_index, worker_count): | ||
output_dir.mkdir(exist_ok=True) | ||
# Open a file that just this worker will write to. | ||
write_to = open(output_dir / f"worker{worker_index}.out", "a") | ||
# Build a function that can be called for each captured output. | ||
def write(item): | ||
write_to.write(f"{item}\n") | ||
|
||
# Return it so Bytewax will run it whenever an item is seen by a | ||
# capture operator. | ||
return write | ||
|
||
def to_tuple(x): | ||
return tuple(map(str, x.split(','))) | ||
|
||
flow = Dataflow() | ||
flow.input("inp", ManualInputConfig(input_builder)) | ||
flow.input("inp", DirInput(input_dir)) | ||
flow.map(str.upper) | ||
flow.capture(ManualOutputConfig(output_builder)) | ||
|
||
flow.map(to_tuple) | ||
flow.output("out", DirOutput(output_dir, 5, assign_file=int)) | ||
|
||
|
||
if __name__ == "__main__": | ||
# We are going to use Waxctl, you can download it from https://bytewax.io/downloads | ||
# Run these commands in your terminal to run a cluster of two containers: | ||
# We are going to use Waxctl, you can download it from https://bytewax.io/downloads | ||
# Run these commands in your terminal to run a cluster of two containers: | ||
|
||
# $ tar -C ./ -cvf cluster.tar examples | ||
# $ waxctl dataflow deploy ./cluster.tar --name k8s-cluster --python-file-name examples/k8s_cluster.py -p2 | ||
# $ tar -C ./ -cvf cluster.tar examples | ||
# $ waxctl dataflow deploy ./cluster.tar --name k8s-cluster --python-file-name examples/k8s_cluster.py -p2 | ||
|
||
# Each worker will read the files in | ||
# ./examples/sample_data/cluster/*.txt which have lines like | ||
# `one1`. | ||
# Each worker will read the files in | ||
# ./examples/sample_data/cluster/*.txt which have lines like | ||
# `1,one1`. | ||
|
||
# They will then both finish and you'll see ./cluster_out/0.out | ||
# and ./cluster_out/1.out with the data that each process in the | ||
# cluster wrote with the lines uppercased. | ||
# They will then both finish and you'll see ./cluster_out/part_0 | ||
# and ./cluster_out/part_1 with the data that each process in the | ||
# cluster wrote with the lines uppercased. | ||
|
||
# To see that files in each container you can run these commands: | ||
# To see the files in each container you can run these commands: | ||
|
||
# kubectl exec -it k8s-cluster-0 -cprocess -- cat /var/bytewax/cluster_out/0.out | ||
# kubectl exec -it k8s-cluster-1 -cprocess -- cat /var/bytewax/cluster_out/1.out | ||
# kubectl exec -it k8s-cluster-0 -cprocess -- cat /var/bytewax/cluster_out/part_0 | ||
# kubectl exec -it k8s-cluster-1 -cprocess -- cat /var/bytewax/cluster_out/part_1 | ||
|
||
# You could imagine reading from / writing to separate Kafka | ||
# partitions, S3 blobs, etc. | ||
# You could imagine reading from / writing to separate Kafka | ||
# partitions, S3 blobs, etc. | ||
|
||
# When using `cluster_main()` you have to coordinate ensuring each | ||
# process knows the address of all other processes in the cluster | ||
# and their unique process ID. You can address that easily by deploying your | ||
# dataflow program using Waxctl or installing the Bytewax Helm Chart | ||
cluster_main(flow, recovery_config=recovery_config, **parse.proc_env()) | ||
# When using `cluster_main()` you have to coordinate ensuring each | ||
# process knows the address of all other processes in the cluster | ||
# and their unique process ID. You can address that easily by deploying your | ||
# dataflow program using Waxctl or installing the Bytewax Helm Chart | ||
# cluster_main(flow, recovery_config=recovery_config, **parse.proc_env()) |
12 changes: 6 additions & 6 deletions
12
charts/bytewax/examples/sample_data/cluster/partition-1.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
one1 | ||
one2 | ||
one3 | ||
one4 | ||
one5 | ||
one6 | ||
1,one1 | ||
2,one2 | ||
3,one3 | ||
4,one4 | ||
5,one5 | ||
6,one6 |
12 changes: 6 additions & 6 deletions
12
charts/bytewax/examples/sample_data/cluster/partition-2.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
two1 | ||
two2 | ||
two3 | ||
two4 | ||
two5 | ||
two6 | ||
1,two1 | ||
2,two2 | ||
3,two3 | ||
4,two4 | ||
5,two5 | ||
6,two6 |
12 changes: 6 additions & 6 deletions
12
charts/bytewax/examples/sample_data/cluster/partition-3.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
three1 | ||
three2 | ||
three3 | ||
three4 | ||
three5 | ||
three6 | ||
1,three1 | ||
2,three2 | ||
3,three3 | ||
4,three4 | ||
5,three5 | ||
6,three6 |
12 changes: 6 additions & 6 deletions
12
charts/bytewax/examples/sample_data/cluster/partition-4.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
four1 | ||
four2 | ||
four3 | ||
four4 | ||
four5 | ||
four6 | ||
1,four1 | ||
2,four2 | ||
3,four3 | ||
4,four4 | ||
5,four5 | ||
6,four6 |
12 changes: 6 additions & 6 deletions
12
charts/bytewax/examples/sample_data/cluster/partition-5.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
five1 | ||
five2 | ||
five3 | ||
five4 | ||
five5 | ||
five6 | ||
1,five1 | ||
2,five2 | ||
3,five3 | ||
4,five4 | ||
5,five5 | ||
6,five6 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.