DEGEN ETL Improvements [V1] #10
base: main
Changes from 37 commits
@@ -0,0 +1 @@
3.9.6
@@ -7,24 +7,24 @@
 from kafka import KafkaProducer

 from blockchainetl.jobs.exporters.converters.composite_item_converter import CompositeItemConverter

+from ethereumetl.deduplication.redis import RedisConnector
+
 class KafkaItemExporter:

     def __init__(self, item_type_to_topic_mapping, converters=()):
         self.item_type_to_topic_mapping = item_type_to_topic_mapping
         self.converter = CompositeItemConverter(converters)
+        self.redis = RedisConnector()
+
         self.connection_url = self.get_connection_url()
         print(self.connection_url)
         self.producer = KafkaProducer(
             bootstrap_servers=self.connection_url,
             security_protocol='SASL_SSL',
             sasl_mechanism='SCRAM-SHA-512',
             sasl_plain_username=os.getenv('KAFKA_SCRAM_USERID'),
             sasl_plain_password=os.getenv('KAFKA_SCRAM_PASSWORD'),
             client_id=socket.gethostname(),
-            compression_type='lz4',
+            compression_type=os.environ.get('KAFKA_COMPRESSION', 'lz4'),
             request_timeout_ms= 60000,
             max_block_ms= 120000,
             buffer_memory= 100000000)
@@ -38,16 +38,24 @@ def get_connection_url(self):
     def open(self):
         pass

     def export_items(self, items):
         for item in items:
             self.export_item(item)

     def export_item(self, item):
         item_type = item.get('type')
-        if item_type is not None and item_type in self.item_type_to_topic_mapping:
+        item_id = item.get('id')
+
+        if item_id is not None and item_type is not None and item_type in self.item_type_to_topic_mapping:
+            item_type = self.item_type_to_topic_mapping[item_type]
             data = json.dumps(item).encode('utf-8')
-            # logging.debug(data)
-            return self.producer.send(self.item_type_to_topic_mapping[item_type], value=data)
+
+            if not self.already_processed(item_type, item_id):
+                logging.info(f'Processing message of Type=[{item_type}]; Id=[{item_id}]')
+                self.mark_processed(item_type, item_id)
+                return self.producer.send(item_type, value=data)
Review comment: Add error handling; write to the logs when an item has been marked as processed but couldn't be produced.
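One way to address this, sketched as an illustration rather than as part of the PR: kafka-python's send() returns a future, so an errback can log the case where the item was already marked as processed in Redis but the produce failed.

# Sketch only (not in this PR): log produce failures that happen after the item
# has already been marked as processed in Redis.
if not self.already_processed(item_type, item_id):
    logging.info(f'Processing message of Type=[{item_type}]; Id=[{item_id}]')
    self.mark_processed(item_type, item_id)
    future = self.producer.send(item_type, value=data)
    future.add_errback(
        lambda exc: logging.error(
            f'Item Type=[{item_type}]; Id=[{item_id}] was marked as processed '
            f'but could not be produced: {exc}'))
    return future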
+
+            logging.info(f'Message was already processed skipping... Type=[{item_type}]; Id=[{item_id}]')
         else:
             logging.warning('Topic for item type "{}" is not configured.'.format(item_type))
@@ -56,9 +64,17 @@ def convert_items(self, items):
             yield self.converter.convert_item(item)

     def close(self):
+        self.redis.close()
         pass

+    # utility function to set a message as processed in Redis
+    def mark_processed(self, item_type, item_id):
+        return self.redis.add_to_set(item_type, item_id)
+
+    # utility function to check whether a message was already processed or not
+    def already_processed(self, item_type, item_id):
+        return self.redis.exists_in_set(item_type, item_id)
+

 def group_by_item_type(items):
     result = collections.defaultdict(list)
     for item in items:
@@ -0,0 +1,46 @@
import os
import clickhouse_connect
import logging


class Clickhouse:
    """
    Clickhouse Connector
    """

    def __init__(self):
        """
        Connect to the database and provide its client.

        Connection settings are read from the environment:
        CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USERNAME,
        CLICKHOUSE_PASSWORD, CLICKHOUSE_DATABASE.
        """
        logging.debug('Connecting to clickhouse !!')

        self._host = os.environ['CLICKHOUSE_HOST']
        self._port = os.environ['CLICKHOUSE_PORT']
        self._username = os.environ['CLICKHOUSE_USERNAME']
        self._password = os.environ['CLICKHOUSE_PASSWORD']
        self._database = os.environ['CLICKHOUSE_DATABASE']

        logging.debug(
            f'Making Connection to DB with host: {self._host}, port: {self._port}, database: {self._database}'
        )
        self._connection = clickhouse_connect.get_client(host=self._host, port=self._port, secure=True,
                                                         username=self._username, password=self._password,
                                                         database=self._database)

    async def run_query(self, query, parameters):
        """Run a query on ClickHouse.

        Args:
            query (str): query to run
            parameters (dict): variable parameters

        Returns:
            list: fetched data
        """
        logging.debug(f'Running SQL Query {query}')
        result = self._connection.query(query=query, parameters=parameters)
        return result.result_rows
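For reference, a minimal usage sketch of this wrapper (illustrative only; the table name is a placeholder): although run_query is declared async, the underlying clickhouse_connect call is synchronous, so callers simply await the wrapper.

# Illustrative usage of the Clickhouse wrapper above; 'transactions' is a placeholder table name.
import asyncio

async def main():
    db = Clickhouse()  # reads CLICKHOUSE_* settings from the environment
    rows = await db.run_query('SELECT count() FROM {table:Identifier}', {'table': 'transactions'})
    print(rows)

asyncio.run(main())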
@@ -0,0 +1,67 @@
import os
import logging
import asyncio
from datetime import datetime
from ethereumetl.constants import constants


def deduplicate_records(records, ts_key, db):
    ch_fallback_days = int(os.environ.get('CLICKHOUSE_FALLBACK_DAYS', constants.CLICKHOUSE_FALLBACK_DAYS))

    if records == None or len(records) == 0:
        return records

    min_ts = get_minimum_ts(records, ts_key)
    if is_ts_older(min_ts, ch_fallback_days):
        records = asyncio.run(filter_records(records, min_ts, db))
    return records


def is_ts_older(ts, days):
    difference = datetime.utcnow() - datetime.utcfromtimestamp(ts)
    return difference.days > days


async def filter_records(items, min_ts_epoch, db):
    if items == None or len(items) == 0:
        return items

    message_type = items[0].get('type')
Review comment: Will all items in the list be of the same type?
Reply: Yes, we run this deduplicate function separately for each type of message array.
    skip_message_types = os.environ.get('CLICKHOUSE_SKIP_FOR_MESSAGE_TYPES')

    if skip_message_types != None and (message_type in skip_message_types.split(',')):
        logging.info(f'Ignoring check for deduplication for type {message_type} as it is ignored')
        return items

    table_name = get_table_name(message_type)
    ts_column_name = get_table_ts_column_name(message_type)
    if table_name == None:
        logging.warn(f'Ignoring check for deduplication for type {message_type} as table not found')
Review comment: This should be a panic instead of a warning; the service should stop execution.
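A sketch of the fail-fast behaviour the reviewer asks for (illustrative, not part of the PR): raise instead of warn, so the process stops rather than silently skipping deduplication.

# Sketch only: stop execution when no table mapping exists for the message type.
if table_name is None:
    raise RuntimeError(
        f'No ClickHouse table configured for message type {message_type}; '
        f'cannot run deduplication')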
        return items

    min_ts = datetime.utcfromtimestamp(min_ts_epoch).strftime('%Y-%m-%d')

    # extract all ids
    ids = list([obj["id"] for obj in items])
    ids_from_db = []

    parameters = { 'table': table_name, 'ids': [], 'timestamp_key': ts_column_name, 'block_timestamp': min_ts }
    query = '''SELECT id FROM {table:Identifier} WHERE id IN {ids:Array(String)} and {timestamp_key:Identifier} >= {block_timestamp:String}'''
Review comment: Why >= and not just =?
Review comment: Also, is there any need for batching?
Reply: Yes, I have observed that in one SQL query, if I add more than 1500 elements in …
    chunk_size = int(os.environ.get('CLICKHOUSE_QUERY_CHUNK_SIZE', constants.CLICKHOUSE_QUERY_CHUNK_SIZE))
    for i in range(0, len(ids), chunk_size):
        chunk = ids[i:i + chunk_size]
        parameters['ids'] = chunk

        db_results = await db.run_query(query, parameters)
        ids_from_db = ids_from_db + [t[0] for t in db_results]

    return [item for item in items if item['id'] not in ids_from_db]
Review comment: Is an array search efficient in Python here? IMO a better approach is to use a map/set for the check.
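A sketch of the set-based lookup the reviewer suggests (illustrative, not part of the PR): membership tests against a set are O(1) on average, versus O(n) against a list.

# Sketch only: collect ids returned by ClickHouse into a set for constant-time lookups.
ids_from_db = set()
for i in range(0, len(ids), chunk_size):
    parameters['ids'] = ids[i:i + chunk_size]
    db_results = await db.run_query(query, parameters)
    ids_from_db.update(t[0] for t in db_results)

return [item for item in items if item['id'] not in ids_from_db]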

def get_table_name(message_type):
    return constants.ENTITY_TO_TABLE_MAP.get(message_type)


def get_table_ts_column_name(message_type):
    return constants.ENTITY_TO_TABLE_TS_COLUMNS_MAP.get(message_type)


def get_minimum_ts(items, key):
    # get timestamp of oldest message from items list
    record = min(items, key=lambda x: x.get(key, float('inf')))
    return record.get(key)
@@ -0,0 +1,27 @@
import os
import hashlib
import redis
from ethereumetl.constants import constants


class RedisConnector:
    def __init__(self):
        self.ttl = os.environ['REDIS_MESSAGE_TTL']

        redis_host = os.environ['REDIS_HOST']
        redis_port = os.environ['REDIS_PORT']
        redis_database = os.environ['REDIS_DB']
Review comment: Use the chainId for REDIS_DB?
Reply: Sure, but let's do that for the next chains. For the current DEGEN deployment we have already provided the value, so changing it mid-stream might break dedup against the keys stored in a different database. Also, this is configurable from envs, so it should be easy to do.
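A sketch of the chain-id-based default discussed here (illustrative only; CHAIN_ID is an assumed variable name, not something this PR defines):

# Sketch only: fall back to the chain id when REDIS_DB is not set explicitly.
redis_database = os.environ.get('REDIS_DB', os.environ.get('CHAIN_ID', '0'))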
        self.redis_client = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_database)

    def exists_in_set(self, key, value):
        return self.redis_client.exists(self.create_key(key, value))

    def add_to_set(self, key, value):
        return self.redis_client.setex(self.create_key(key, value), self.ttl, '1')

    def create_key(self, key, value):
        hashed_data = hashlib.sha1(f"{key}_{value}".encode()).hexdigest()
        return f"{constants.REDIS_PREFIX}_{hashed_data}"

    def close(self):
        self.redis_client.close()
Review comment: There should be a feature flag to turn the deduplication logic on and off.
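A minimal sketch of such a flag (illustrative only; the DEDUPLICATION_ENABLED variable name is an assumption, not something defined in this PR):

# Sketch only: gate the deduplication path behind an environment flag.
import os

def is_deduplication_enabled():
    # e.g. DEDUPLICATION_ENABLED=true|false, defaulting to enabled
    return os.environ.get('DEDUPLICATION_ENABLED', 'true').lower() == 'true'

# In KafkaItemExporter.export_item, the Redis check could then become:
#     if not is_deduplication_enabled() or not self.already_processed(item_type, item_id):
#         ...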