
Non-graph solution, but extremely memory intensive

master
nitowa committed 1 year ago · commit 183723e46f
2 changed files with 308 additions and 118 deletions
  1. src/spark/main.py (+151, -118)
  2. src/spark/main_bak.py (+157, -0)
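
The rewritten main.py drops the old per-row find() matching and instead grows clusters with array joins: transactions are grouped into (tx_id, addresses) sets and overlaps are detected with F.arrays_overlap. A minimal, runnable sketch of that overlap step on toy data (the local session and example rows are hypothetical; the real job reads the Cassandra tables configured in settings.json):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("overlap-sketch").getOrCreate()

# Toy (tx_id, address) rows standing in for the Cassandra tx table.
tx_df = spark.createDataFrame(
    [("t1", "a"), ("t1", "b"), ("t2", "b"), ("t2", "c"), ("t3", "x")],
    ["tx_id", "address"],
)

# Group each transaction into its set of addresses, as the new main.py does.
tx_grouped = tx_df.groupBy("tx_id").agg(F.collect_set("address").alias("addresses"))

# Take one transaction as the seed and find every transaction sharing an address with it.
seed = tx_grouped.limit(1).select(F.col("addresses").alias("seed_addresses"))
overlapping = tx_grouped \
    .crossJoin(seed) \
    .where(F.arrays_overlap("addresses", "seed_addresses"))

overlapping.show(truncate=False)  # the seed plus every tx that shares an address with it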

src/spark/main.py (+151, -118)

@@ -1,5 +1,6 @@
 from gc import collect
 import json
+from select import select

 from sqlite3 import Row
 from typing import Iterable, List
@@ -14,6 +15,7 @@ start = time.time()
 config = json.load(open("./settings.json"))
 debug = config['debug']

+
 class Master:
     spark: SparkSession
     CLUSTERS_TABLE: str
@@ -25,133 +27,164 @@ class Master:
         self.CLUSTERS_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"
         self.TX_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['tx_table_name']}"

-    def makeSparkContext(self,config) -> SparkSession:
+    def makeSparkContext(self, config) -> SparkSession:
         return SparkSession.builder \
-        .appName('SparkCassandraApp') \
-        .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
-        .getOrCreate()
-
-    def group_tx_addrs(self) -> DataFrame:
-        return self.spark \
-            .read \
-            .table(self.TX_TABLE) \
-            .groupBy("tx_id") \
-            .agg(F.collect_set('address').alias('addresses'))
-
-    def group_cluster_addrs(self) -> DataFrame:
-        return self.spark \
-            .read \
-            .table(self.CLUSTERS_TABLE) \
-            .groupBy("id") \
-            .agg(F.collect_set('address').alias('addresses'))
-
-    def insertNewCluster (self, addrs: Iterable[str], root: str | None = None) -> str:
-        if(root == None):
-            root = addrs[0]
-        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'id'])
-        df.writeTo(self.CLUSTERS_TABLE).append()
-        return root
-
-    def enumerate(self, data: DataFrame) -> DataFrame:
-        return data \
-            .rdd \
-            .zipWithIndex() \
-            .toDF(["tx_group", "index"])
-
-    def rewrite_cluster_id(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
-        cluster_rewrite = self.spark \
-            .table(self.CLUSTERS_TABLE) \
-            .where(F.col('id').isin(cluster_roots)) \
-            .select('address') \
+            .appName('SparkCassandraApp') \
+            .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
+            .getOrCreate()
+
+    def get_tx_dataframe(self) -> DataFrame:
+        return self.spark.table(self.TX_TABLE)
+
+    def union_single_col(self, df1: DataFrame, df2: DataFrame, column: str) -> DataFrame:
+        return df1 \
+            .select(column) \
+            .union(df2.select(column))
+
+    def reduce_concat_array_column(self, df: DataFrame, column: str, distinct:bool = False) -> DataFrame:
+        df = self.explode_array_col(df.select(column), column)
+        return self.collect_col_to_array(df, column, distinct)
+
+    def collect_col_to_array(self, df: DataFrame, column: str, distinct: bool = False) -> DataFrame:
+        if(distinct):
+            return df.select(F.collect_set(column).alias(column))
+        else:
+            return df.select(F.collect_list(column).alias(column))
+
+    def explode_array_col(self, df: DataFrame, column: str) -> DataFrame:
+        return df \
             .rdd \
-            .map(lambda addr: (addr['address'], new_cluster_root)) \
-            .toDF(['address', 'id']) \
-
-        if(debug):
-            print("REWRITE JOB")
-            cluster_rewrite.show(truncate=False, vertical=True)
-            print()
-
-        cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
-# end class Master
-
-
-"""
-    tuple structure:
-        Row => Row(id=addr, addresses=list[addr] | the cluster
-        Iterable[str] => list[addr] | the transaction addresses
-"""
-def find(data: tuple[Row, Iterable[str]]) -> str | None:
-    cluster = data[0]
-    tx = data[1]
+            .flatMap(lambda row: list(map(lambda elem: (elem,), row[column]))) \
+            .toDF([column])
+
+    def array_col_to_elements(self, df: DataFrame, column: str, distinct:bool = False) -> DataFrame:
+        exploded = master.explode_array_col(
+            df,
+            column
+        )
+
+        #this is likely redundant
+        collected = master.collect_col_to_array(
+            exploded,
+            column,
+            distinct
+        )
+
+        return self.explode_array_col(
+            collected,
+            column
+        )

-    clusteraddresses = cluster['addresses'] + [cluster['id']]
+# end class Master

-    if any(x in tx for x in clusteraddresses):
-        return cluster['id']
-    else:
-        return None

 master = Master(config)
-
-tx_addr_groups = master.group_tx_addrs()
-tx_groups_indexed = master.enumerate(tx_addr_groups).cache()
-
-for i in range(0, tx_addr_groups.count()):
-    cluster_addr_groups = master.group_cluster_addrs()
-
-    if(debug):
-        print("KNOWN CLUSTERS")
-        cluster_addr_groups.show(truncate=True)
-        print()
-
-    tx_addrs: Iterable[str] = tx_groups_indexed \
-        .where(tx_groups_indexed.index == i) \
-        .select('tx_group') \
-        .collect()[0]['tx_group']['addresses']
-
-    if(debug):
-        print("CURRENT TX")
-        print(tx_addrs)
-        print()
-
-    if (cluster_addr_groups.count() == 0):
-        master.insertNewCluster(tx_addrs)
-        continue
-
-    cluster_tx_mapping = cluster_addr_groups \
+tx_df = master.get_tx_dataframe()
+
+
+#Turn transactions into a list of ('id', [addr, addr, ...])
+tx_grouped = tx_df \
+    .groupBy('tx_id') \
+    .agg(F.collect_set('address').alias('addresses')) \
+    .rdd \
+    .zipWithIndex() \
+    .toDF(['tx', 'index']) \
+    .select(
+        F.col('tx.tx_id').alias('tx_id'),
+        F.col('tx.addresses').alias('addresses'),
+        'index'
+    ) \
+    .cache()
+
+
+# TODO: Load clusters from DB, check if any exist, if no make initial cluster, else proceed with loaded data
+
+# find initial cluster
+
+# take the first tx
+tx_zero = tx_grouped \
+    .select(tx_grouped.tx_id, tx_grouped.addresses) \
+    .where(tx_grouped.index == 0)
+
+# find txs with overlapping addresses
+overlapping_txs = tx_grouped \
+    .where((tx_grouped.index != 0)) \
+    .join(tx_zero.withColumnRenamed('addresses', 'tx_addresses')) \
+    .select(
+        tx_grouped.index,
+        tx_grouped.addresses,
+        F.arrays_overlap(tx_grouped.addresses, 'tx_addresses').alias('overlap')
+    ) \
+    .where(F.col('overlap') == True) \
+
+# overlapped txs must not be considered anymore, so remove them candidate dataframe
+tx_grouped = tx_grouped \
+    .join(overlapping_txs, 'index', 'leftanti') \
+    .filter(tx_grouped.index != 0)
+
+# get the distinct addresses of all overlaps in a single array
+distinct_addresses = master.reduce_concat_array_column(
+    master.union_single_col(
+        overlapping_txs, tx_zero, column='addresses'
+    ),
+    column='addresses',
+    distinct=True,
+)
+
+#pick out a random representative for this cluster and add it to every address
+cluster = distinct_addresses \
+    .rdd \
+    .flatMap(lambda row: list(map(lambda addr: (addr, row['addresses'][0]), row['addresses']))) \
+    .toDF(['address', 'id'])
+
+# done finding initial cluster
+
+#group cluster by representative and transform the result into a list of shape ('id', ['addr', 'addr', ...])
+clusters_grouped = cluster \
+    .groupBy('id') \
+    .agg(F.collect_list('address').alias('addresses'))
+
+def take_tx_and_cluster(txs: DataFrame, clusters: DataFrame):
+    if (txs.count() == 0):  # done!
+        return clusters
+
+    # take a random tx
+    tx = txs \
+        .select('*').limit(1)
+
+    # find clusters with overlapping addresses from tx
+    overlapping_clusters = clusters \
+        .join(tx.withColumnRenamed('addresses', 'tx_addresses')) \
+        .select(
+            clusters.id,
+            clusters.addresses,
+            F.arrays_overlap(clusters.addresses,'tx_addresses').alias('overlap')
+        ) \
+        .where(F.col('overlap') == True)
+
+    #collect all addresses into single array field
+    new_cluster_arr = master.reduce_concat_array_column(
+        master.union_single_col(tx, overlapping_clusters, 'addresses'),
+        column='addresses',
+        distinct=True
+    )
+
+    #declare cluster representative
+    new_cluster = new_cluster_arr \
         .rdd \
-        .map(lambda cluster: (cluster, tx_addrs))
-
-    if(debug):
-        print("cluster_tx_mapping")
-        cluster_tx_mapping \
-            .toDF(['cluster', 'tx']) \
-            .show(truncate=True)
-        print()
-
-
-    matched_roots: "List[str]" = cluster_tx_mapping \
-        .map(find) \
-        .filter(lambda root: root != None) \
-        .collect()
-
-    if(debug):
-        print("FOUND ROOTS")
-        print(matched_roots)
-        print()
+        .flatMap(lambda row: list(map(lambda addr: (addr, row['addresses'][0]), row['addresses']))) \
+        .toDF(['address', 'id']) \
+        .groupBy('id') \
+        .agg(F.collect_list('address').alias('addresses'))

+    #start new round with txs minus the one just used, and updated clusters
+    return take_tx_and_cluster(
+        txs.join(tx, 'index', 'leftanti'),
+        clusters.join(overlapping_clusters, 'id', 'leftanti').union(new_cluster)
+    )

-    if(len(matched_roots) == 0):
-        master.insertNewCluster(tx_addrs)
-    elif(len(matched_roots) == 1):
-        master.insertNewCluster(tx_addrs, matched_roots[0])
-    else:
-        master.rewrite_cluster_id(matched_roots[1:], matched_roots[0])
-        master.insertNewCluster(tx_addrs, matched_roots[0])

-    if(debug):
-        print("======================================================================")
+take_tx_and_cluster(tx_grouped, clusters_grouped).show()

 end = time.time()
-print("ELAPSED TIME:", end-start)
+print("ELAPSED TIME:", end-start)

src/spark/main_bak.py (+157, -0)

@@ -0,0 +1,157 @@
+from gc import collect
+import json
+
+from sqlite3 import Row
+from typing import Iterable, List
+
+from pyspark.sql import SparkSession, DataFrame, Row
+from pyspark.sql import functions as F
+
+import time
+start = time.time()
+
+
+config = json.load(open("./settings.json"))
+debug = config['debug']
+
+class Master:
+    spark: SparkSession
+    CLUSTERS_TABLE: str
+    TX_TABLE: str
+
+    def __init__(self, config):
+        self.spark = self.makeSparkContext(config)
+        self.config = config
+        self.CLUSTERS_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"
+        self.TX_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['tx_table_name']}"
+
+    def makeSparkContext(self,config) -> SparkSession:
+        return SparkSession.builder \
+        .appName('SparkCassandraApp') \
+        .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
+        .getOrCreate()
+
+    def group_tx_addrs(self) -> DataFrame:
+        return self.spark \
+            .read \
+            .table(self.TX_TABLE) \
+            .groupBy("tx_id") \
+            .agg(F.collect_set('address').alias('addresses'))
+
+    def group_cluster_addrs(self) -> DataFrame:
+        return self.spark \
+            .read \
+            .table(self.CLUSTERS_TABLE) \
+            .groupBy("id") \
+            .agg(F.collect_set('address').alias('addresses'))
+
+    def insertNewCluster (self, addrs: Iterable[str], root: str | None = None) -> str:
+        if(root == None):
+            root = addrs[0]
+        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'id'])
+        df.writeTo(self.CLUSTERS_TABLE).append()
+        return root
+
+    def enumerate(self, data: DataFrame) -> DataFrame:
+        return data \
+            .rdd \
+            .zipWithIndex() \
+            .toDF(["tx_group", "index"])
+
+    def rewrite_cluster_id(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
+        cluster_rewrite = self.spark \
+            .table(self.CLUSTERS_TABLE) \
+            .where(F.col('id').isin(cluster_roots)) \
+            .select('address') \
+            .rdd \
+            .map(lambda addr: (addr['address'], new_cluster_root)) \
+            .toDF(['address', 'id']) \
+
+        if(debug):
+            print("REWRITE JOB")
+            cluster_rewrite.show(truncate=False, vertical=True)
+            print()
+
+        cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
+# end class Master
+
+
+"""
+    tuple structure:
+        Row => Row(id=addr, addresses=list[addr] | the cluster
+        Iterable[str] => list[addr] | the transaction addresses
+"""
+def find(data: tuple[Row, Iterable[str]]) -> str | None:
+    cluster = data[0]
+    tx = data[1]
+
+    clusteraddresses = cluster['addresses'] + [cluster['id']]
+
+    if any(x in tx for x in clusteraddresses):
+        return cluster['id']
+    else:
+        return None
+
+master = Master(config)
+
+tx_addr_groups = master.group_tx_addrs()
+tx_groups_indexed = master.enumerate(tx_addr_groups).cache()
+
+for i in range(0, tx_addr_groups.count()):
+    cluster_addr_groups = master.group_cluster_addrs()
+
+    if(debug):
+        print("KNOWN CLUSTERS")
+        cluster_addr_groups.show(truncate=True)
+        print()
+
+    tx_addrs: Iterable[str] = tx_groups_indexed \
+        .where(tx_groups_indexed.index == i) \
+        .select('tx_group') \
+        .collect()[0]['tx_group']['addresses']
+
+    if(debug):
+        print("CURRENT TX")
+        print(tx_addrs)
+        print()
+
+    if (cluster_addr_groups.count() == 0):
+        master.insertNewCluster(tx_addrs)
+        continue
+
+    cluster_tx_mapping = cluster_addr_groups \
+        .rdd \
+        .map(lambda cluster: (cluster, tx_addrs))
+
+    if(debug):
+        print("cluster_tx_mapping")
+        cluster_tx_mapping \
+            .toDF(['cluster', 'tx']) \
+            .show(truncate=True)
+        print()
+
+
+    matched_roots: "List[str]" = cluster_tx_mapping \
+        .map(find) \
+        .filter(lambda root: root != None) \
+        .collect()
+
+    if(debug):
+        print("FOUND ROOTS")
+        print(matched_roots)
+        print()
+
+
+    if(len(matched_roots) == 0):
+        master.insertNewCluster(tx_addrs)
+    elif(len(matched_roots) == 1):
+        master.insertNewCluster(tx_addrs, matched_roots[0])
+    else:
+        master.rewrite_cluster_id(matched_roots[1:], matched_roots[0])
+        master.insertNewCluster(tx_addrs, matched_roots[0])
+
+    if(debug):
+        print("======================================================================")
+
+end = time.time()
+print("ELAPSED TIME:", end-start)
