
working non-graph implementation

master
nitowa 1 year ago
parent
commit 9cb6827c5e
1 changed file with 40 additions and 53 deletions

src/spark/main.py  +40 −53

@@ -1,9 +1,4 @@
-from gc import collect
 import json
-from select import select
-
-from sqlite3 import Row
-from typing import Iterable, List
 
 from pyspark.sql import SparkSession, DataFrame, Row
 from pyspark.sql import functions as F
@@ -56,46 +51,18 @@ class Master:
             .rdd \
             .flatMap(lambda row: list(map(lambda elem: (elem,), row[column]))) \
             .toDF([column])
-
-    def array_col_to_elements(self, df: DataFrame, column: str, distinct:bool = False) -> DataFrame:
-        exploded = master.explode_array_col(
-            df,
-            column
-        )
-
-        #this is likely redundant
-        collected = master.collect_col_to_array(
-            exploded, 
-            column, 
-            distinct
-        )
-
-        return self.explode_array_col(
-            collected,
-            column
-        )
-
 # end class Master
 
 
 master = Master(config)
+master.spark.catalog.clearCache()
+master.spark.sparkContext.setCheckpointDir('./checkpoints')
 tx_df = master.get_tx_dataframe()
 
-
 #Turn transactions into a list of ('id', [addr, addr, ...])
 tx_grouped = tx_df \
     .groupBy('tx_id') \
-    .agg(F.collect_set('address').alias('addresses')) \
-    .rdd \
-    .zipWithIndex() \
-    .toDF(['tx', 'index']) \
-    .select(
-        F.col('tx.tx_id').alias('tx_id'),
-        F.col('tx.addresses').alias('addresses'),
-        'index'
-    ) \
-    .cache()
-
+    .agg(F.collect_set('address').alias('addresses'))
 
 # TODO: Load clusters from DB, check if any exist, if no make initial cluster, else proceed with loaded data
 
@@ -103,29 +70,39 @@ tx_grouped = tx_df \
 
 # take the first tx
 tx_zero = tx_grouped \
-    .select(tx_grouped.tx_id, tx_grouped.addresses) \
-    .where(tx_grouped.index == 0)
+    .select('*') \
+    .where('tx_id = 3') \
+    .limit(1)
 
 # find txs with overlapping addresses
 overlapping_txs = tx_grouped \
-    .where((tx_grouped.index != 0)) \
-    .join(tx_zero.withColumnRenamed('addresses', 'tx_addresses')) \
+    .join(
+        tx_zero \
+            .withColumnRenamed('addresses', 'tx_addresses') \
+            .withColumnRenamed('tx_id', 'overlap_id')
+    ) \
     .select(
-        tx_grouped.index,
+        tx_grouped.tx_id,
         tx_grouped.addresses,
         F.arrays_overlap(tx_grouped.addresses, 'tx_addresses').alias('overlap')
     ) \
     .where(F.col('overlap') == True) \
+    .drop('overlap')
 
 # overlapped txs must not be considered anymore, so remove them candidate dataframe
 tx_grouped = tx_grouped \
-    .join(overlapping_txs, 'index', 'leftanti') \
-    .filter(tx_grouped.index != 0)
+    .join(
+        overlapping_txs.drop('addresses'), 
+        'tx_id', 
+        'leftanti'
+    )
 
 # get the distinct addresses of all overlaps in a single array
 distinct_addresses = master.reduce_concat_array_column(
     master.union_single_col(
-        overlapping_txs, tx_zero, column='addresses'
+        overlapping_txs, 
+        tx_zero, 
+        column='addresses'
     ), 
     column='addresses',
     distinct=True,
@@ -144,7 +121,7 @@ clusters_grouped = cluster \
     .groupBy('id') \
     .agg(F.collect_list('address').alias('addresses'))
 
-def take_tx_and_cluster(txs: DataFrame, clusters: DataFrame):
+def take_tx_and_cluster(txs: DataFrame, clusters: DataFrame, n=0):
     if (txs.count() == 0):  # done!
         return clusters
 
@@ -158,33 +135,43 @@ def take_tx_and_cluster(txs: DataFrame, clusters: DataFrame):
         .select(
             clusters.id,
             clusters.addresses,
+            'tx_addresses',
            F.arrays_overlap(clusters.addresses,'tx_addresses').alias('overlap')
         ) \
         .where(F.col('overlap') == True)
 
+    clusters_union_tx = master.union_single_col(tx, overlapping_clusters, 'addresses')
+
     #collect all addresses into single array field
-    new_cluster_arr = master.reduce_concat_array_column(
-        master.union_single_col(tx, overlapping_clusters, 'addresses'),
+    new_cluster_arrays = master.reduce_concat_array_column(
+        clusters_union_tx,
         column='addresses',
         distinct=True
     )
 
     #declare cluster representative
-    new_cluster = new_cluster_arr \
+    new_cluster = new_cluster_arrays \
         .rdd \
         .flatMap(lambda row: list(map(lambda addr: (addr, row['addresses'][0]), row['addresses']))) \
         .toDF(['address', 'id']) \
         .groupBy('id') \
         .agg(F.collect_list('address').alias('addresses'))
 
+    txs = txs.join(tx, 'tx_id', 'leftanti')
+    clusters = clusters.join(overlapping_clusters, 'id', 'leftanti').union(new_cluster)
+
+    #the RDD legacy (internal history tracker) gets too big as iterations continue, use checkpoint to prune it regularly
+    if(n % 3 == 0):
+        txs = txs.checkpoint()
+        clusters = clusters.checkpoint()
+
     #start new round with txs minus the one just used, and updated clusters
-    return take_tx_and_cluster(
-        txs.join(tx, 'index', 'leftanti'), 
-        clusters.join(overlapping_clusters, 'id', 'leftanti').union(new_cluster)
-    )
+    return take_tx_and_cluster(txs,clusters,n+1)
 
 
-take_tx_and_cluster(tx_grouped, clusters_grouped).show()
+result = take_tx_and_cluster(tx_grouped, clusters_grouped).collect()
+for row in result:
+    print(sorted(row['addresses']))
 
 end = time.time()
 print("ELAPSED TIME:", end-start)
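
For reference, here is a standalone sketch (not code from this repository) of the two building blocks the new implementation leans on: F.arrays_overlap to find transactions that share at least one address with a seed transaction, and DataFrame.checkpoint() to cut the query lineage that otherwise keeps growing across iterations. The local[*] master, the toy rows, and the variable names below are assumptions for illustration only.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master('local[*]').appName('overlap-sketch').getOrCreate()
spark.sparkContext.setCheckpointDir('./checkpoints')  # must be set before checkpoint()

# toy (tx_id, address) rows standing in for the real transaction table
tx_df = spark.createDataFrame(
    [(1, 'a'), (1, 'b'), (2, 'b'), (2, 'c'), (3, 'x'), (4, 'c'), (4, 'd')],
    ['tx_id', 'address'])

# one row per transaction with its distinct set of addresses
tx_grouped = tx_df.groupBy('tx_id').agg(F.collect_set('address').alias('addresses'))

# seed transaction, columns renamed so the join result keeps both address arrays
tx_zero = tx_grouped.limit(1) \
    .withColumnRenamed('addresses', 'tx_addresses') \
    .withColumnRenamed('tx_id', 'overlap_id')

# cross join + arrays_overlap: every tx (including the seed itself) that shares
# an address with the seed transaction
overlapping_txs = tx_grouped.crossJoin(tx_zero) \
    .where(F.arrays_overlap('addresses', 'tx_addresses')) \
    .select('tx_id', 'addresses')

# remove the seed and everything it absorbed, then truncate the lineage so the
# plan stays small when this step is repeated round after round
remaining = tx_grouped.join(overlapping_txs, 'tx_id', 'leftanti').checkpoint()

overlapping_txs.show(truncate=False)
remaining.show(truncate=False)

Checkpointing materialises the intermediate DataFrame under the configured checkpoint directory and replaces its accumulated plan, which is what the new n % 3 == 0 branch in take_tx_and_cluster relies on to keep the recursive rounds from slowing down.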
