DEGEN ETL Improvements [V1] #10
base: main
Changes from 37 commits
@@ -0,0 +1 @@
3.9.6
@@ -7,24 +7,24 @@
 from kafka import KafkaProducer

 from blockchainetl.jobs.exporters.converters.composite_item_converter import CompositeItemConverter

+from ethereumetl.deduplication.redis import RedisConnector
+
 class KafkaItemExporter:

     def __init__(self, item_type_to_topic_mapping, converters=()):
         self.item_type_to_topic_mapping = item_type_to_topic_mapping
         self.converter = CompositeItemConverter(converters)
+        self.redis = RedisConnector()
+
         self.connection_url = self.get_connection_url()
         print(self.connection_url)
         self.producer = KafkaProducer(
             bootstrap_servers=self.connection_url,
             security_protocol='SASL_SSL',
             sasl_mechanism='SCRAM-SHA-512',
             sasl_plain_username=os.getenv('KAFKA_SCRAM_USERID'),
             sasl_plain_password=os.getenv('KAFKA_SCRAM_PASSWORD'),
             client_id=socket.gethostname(),
-            compression_type='lz4',
+            compression_type=os.environ.get('KAFKA_COMPRESSION', 'lz4'),
             request_timeout_ms= 60000,
             max_block_ms= 120000,
             buffer_memory= 100000000)
@@ -38,16 +38,24 @@ def get_connection_url(self):
     def open(self):
         pass

     def export_items(self, items):
         for item in items:
             self.export_item(item)

     def export_item(self, item):
         item_type = item.get('type')
-        if item_type is not None and item_type in self.item_type_to_topic_mapping:
+        item_id = item.get('id')
+
+        if item_id is not None and item_type is not None and item_type in self.item_type_to_topic_mapping:
+            item_type = self.item_type_to_topic_mapping[item_type]
             data = json.dumps(item).encode('utf-8')
-            # logging.debug(data)
-            return self.producer.send(self.item_type_to_topic_mapping[item_type], value=data)
+
+            if not self.already_processed(item_type, item_id):
+                logging.info(f'Processing message of Type=[{item_type}]; Id=[{item_id}]')
+                self.mark_processed(item_type, item_id)
+                return self.producer.send(item_type, value=data)
Review comment: Add error handling; write to the logs when an item has been marked as processed but couldn't be produced.
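One way to address this, sketched as an illustration rather than as part of the PR: kafka-python's send() returns a future, so an errback can log the case where the item was already marked as processed in Redis but the produce failed.

# Sketch only (not in this PR): log produce failures that happen after the item
# has already been marked as processed in Redis.
if not self.already_processed(item_type, item_id):
    logging.info(f'Processing message of Type=[{item_type}]; Id=[{item_id}]')
    self.mark_processed(item_type, item_id)
    future = self.producer.send(item_type, value=data)
    future.add_errback(
        lambda exc: logging.error(
            f'Item Type=[{item_type}]; Id=[{item_id}] was marked as processed '
            f'but could not be produced: {exc}'))
    return future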
+
+            logging.info(f'Message was already processed skipping... Type=[{item_type}]; Id=[{item_id}]')
         else:
             logging.warning('Topic for item type "{}" is not configured.'.format(item_type))
@@ -56,9 +64,17 @@ def convert_items(self, items):
             yield self.converter.convert_item(item)

     def close(self):
+        self.redis.close()
         pass

+    # utility function to set a message as processed in Redis
+    def mark_processed(self, item_type, item_id):
+        return self.redis.add_to_set(item_type, item_id)
+
+    # utility function to check whether a message was already processed or not
+    def already_processed(self, item_type, item_id):
+        return self.redis.exists_in_set(item_type, item_id)
+

 def group_by_item_type(items):
     result = collections.defaultdict(list)
     for item in items:
@@ -0,0 +1,46 @@
import os
import clickhouse_connect
import logging


class Clickhouse:
    """
    Clickhouse Connector
    """

    def __init__(self):
        """
        Connect to the database and provide its client.

        Connection settings are read from the environment:
        CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USERNAME,
        CLICKHOUSE_PASSWORD, CLICKHOUSE_DATABASE.
        """
        logging.debug('Connecting to clickhouse !!')

        self._host = os.environ['CLICKHOUSE_HOST']
        self._port = os.environ['CLICKHOUSE_PORT']
        self._username = os.environ['CLICKHOUSE_USERNAME']
        self._password = os.environ['CLICKHOUSE_PASSWORD']
        self._database = os.environ['CLICKHOUSE_DATABASE']

        logging.debug(
            f'Making Connection to DB with host: {self._host}, port: {self._port}, database: {self._database}'
        )
        self._connection = clickhouse_connect.get_client(host=self._host, port=self._port, secure=True,
                                                         username=self._username, password=self._password,
                                                         database=self._database)

    async def run_query(self, query, parameters):
        """Run a query on ClickHouse.

        Args:
            query (str): query to run
            parameters (dict): variable parameters

        Returns:
            list: fetched data
        """
        logging.debug(f'Running SQL Query {query}')
        result = self._connection.query(query=query, parameters=parameters)
        return result.result_rows
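For reference, a minimal usage sketch of this wrapper (illustrative only; the table name is a placeholder): although run_query is declared async, the underlying clickhouse_connect call is synchronous, so callers simply await the wrapper.

# Illustrative usage of the Clickhouse wrapper above; 'transactions' is a placeholder table name.
import asyncio

async def main():
    db = Clickhouse()  # reads CLICKHOUSE_* settings from the environment
    rows = await db.run_query('SELECT count() FROM {table:Identifier}', {'table': 'transactions'})
    print(rows)

asyncio.run(main())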
@@ -0,0 +1,67 @@
import os
import logging
import asyncio
from datetime import datetime
from ethereumetl.constants import constants


def deduplicate_records(records, ts_key, db):
    ch_fallback_days = int(os.environ.get('CLICKHOUSE_FALLBACK_DAYS', constants.CLICKHOUSE_FALLBACK_DAYS))

    if records == None or len(records) == 0:
        return records

    min_ts = get_minimum_ts(records, ts_key)
    if is_ts_older(min_ts, ch_fallback_days):
        records = asyncio.run(filter_records(records, min_ts, db))
    return records


def is_ts_older(ts, days):
    difference = datetime.utcnow() - datetime.utcfromtimestamp(ts)
    return difference.days > days


async def filter_records(items, min_ts_epoch, db):
    if items == None or len(items) == 0:
        return items

    message_type = items[0].get('type')
Review comment: Will all items in the list be of the same type?
Reply: Yes, we run this deduplicate function separately for each type of message array.
    skip_message_types = os.environ.get('CLICKHOUSE_SKIP_FOR_MESSAGE_TYPES')

    if skip_message_types != None and (message_type in skip_message_types.split(',')):
        logging.info(f'Ignoring check for deduplication for type {message_type} as it is ignored')
        return items

    table_name = get_table_name(message_type)
    ts_column_name = get_table_ts_column_name(message_type)
    if table_name == None:
        logging.warn(f'Ignoring check for deduplication for type {message_type} as table not found')
Review comment: This should be a panic instead of a warning; the service should stop execution.
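A sketch of the fail-fast behaviour the reviewer asks for (illustrative, not part of the PR): raise instead of warn, so the process stops rather than silently skipping deduplication.

# Sketch only: stop execution when no table mapping exists for the message type.
if table_name is None:
    raise RuntimeError(
        f'No ClickHouse table configured for message type {message_type}; '
        f'cannot run deduplication')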
        return items

    min_ts = datetime.utcfromtimestamp(min_ts_epoch).strftime('%Y-%m-%d')

    # extract all ids
    ids = list([obj["id"] for obj in items])
    ids_from_db = []

    parameters = { 'table': table_name, 'ids': [], 'timestamp_key': ts_column_name, 'block_timestamp': min_ts }
    query = '''SELECT id FROM {table:Identifier} WHERE id IN {ids:Array(String)} and {timestamp_key:Identifier} >= {block_timestamp:String}'''
Review comment: Why >= and not just =?
Review comment: Also, is there any need for batching?
Reply: Yes, I have observed that in one SQL query, if I add more than 1500 elements in …
    chunk_size = int(os.environ.get('CLICKHOUSE_QUERY_CHUNK_SIZE', constants.CLICKHOUSE_QUERY_CHUNK_SIZE))
    for i in range(0, len(ids), chunk_size):
        chunk = ids[i:i + chunk_size]
        parameters['ids'] = chunk

        db_results = await db.run_query(query, parameters)
        ids_from_db = ids_from_db + [t[0] for t in db_results]

    return [item for item in items if item['id'] not in ids_from_db]
Review comment: Is an array search efficient in Python here? IMO a better approach is to use a map/set for the check.
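A sketch of the set-based lookup the reviewer suggests (illustrative, not part of the PR): membership tests against a set are O(1) on average, versus O(n) against a list.

# Sketch only: collect ids returned by ClickHouse into a set for constant-time lookups.
ids_from_db = set()
for i in range(0, len(ids), chunk_size):
    parameters['ids'] = ids[i:i + chunk_size]
    db_results = await db.run_query(query, parameters)
    ids_from_db.update(t[0] for t in db_results)

return [item for item in items if item['id'] not in ids_from_db]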

def get_table_name(message_type):
    return constants.ENTITY_TO_TABLE_MAP.get(message_type)


def get_table_ts_column_name(message_type):
    return constants.ENTITY_TO_TABLE_TS_COLUMNS_MAP.get(message_type)


def get_minimum_ts(items, key):
    # get timestamp of oldest message from items list
    record = min(items, key=lambda x: x.get(key, float('inf')))
    return record.get(key)
@@ -0,0 +1,27 @@
import os
import hashlib
import redis
from ethereumetl.constants import constants


class RedisConnector:
    def __init__(self):
        self.ttl = os.environ['REDIS_MESSAGE_TTL']

        redis_host = os.environ['REDIS_HOST']
        redis_port = os.environ['REDIS_PORT']
        redis_database = os.environ['REDIS_DB']
Review comment: Use the chainId for REDIS_DB?
Reply: Sure, but let's do that for the next chains. For the current DEGEN deployment we have already provided the value, so changing it mid-stream might break dedup against the keys stored in a different database. Also, this is configurable from envs, so it should be easy to do.
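A sketch of the chain-id-based default discussed here (illustrative only; CHAIN_ID is an assumed variable name, not something this PR defines):

# Sketch only: fall back to the chain id when REDIS_DB is not set explicitly.
redis_database = os.environ.get('REDIS_DB', os.environ.get('CHAIN_ID', '0'))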
        self.redis_client = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_database)

    def exists_in_set(self, key, value):
        return self.redis_client.exists(self.create_key(key, value))

    def add_to_set(self, key, value):
        return self.redis_client.setex(self.create_key(key, value), self.ttl, '1')

    def create_key(self, key, value):
        hashed_data = hashlib.sha1(f"{key}_{value}".encode()).hexdigest()
        return f"{constants.REDIS_PREFIX}_{hashed_data}"

    def close(self):
        self.redis_client.close()
Review comment: There should be a feature flag to turn the deduplication logic on and off.
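A minimal sketch of such a flag (illustrative only; the DEDUPLICATION_ENABLED variable name is an assumption, not something defined in this PR):

# Sketch only: gate the deduplication path behind an environment flag.
import os

def is_deduplication_enabled():
    # e.g. DEDUPLICATION_ENABLED=true|false, defaulting to enabled
    return os.environ.get('DEDUPLICATION_ENABLED', 'true').lower() == 'true'

# In KafkaItemExporter.export_item, the Redis check could then become:
#     if not is_deduplication_enabled() or not self.already_processed(item_type, item_id):
#         ...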