@@ -0,0 +1,114 @@
+import json
+import time
+from typing import Iterable, List
+
+from pyspark.sql import SparkSession, DataFrame, Row
+from pyspark.sql import functions as F
+
+start = time.time()
+
+
+# load runtime settings (Cassandra catalog, keyspace and table names, checkpoint dir, ...)
+with open("./settings.json") as settings_file:
+    config = json.load(settings_file)
+debug = config['debug']
+
+
+class Master:
+    """Holds the Spark session and the fully qualified Cassandra table names."""
+
+    spark: SparkSession
+    CLUSTERS_TABLE: str
+    TX_TABLE: str
+
+    def __init__(self, config):
+        self.spark = self.makeSparkContext(config)
+        self.config = config
+        self.CLUSTERS_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"
+        self.TX_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['tx_table_name']}"
+
+    def makeSparkContext(self, config) -> SparkSession:
+        return SparkSession.builder \
+            .appName('SparkCassandraApp') \
+            .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
+            .getOrCreate()
+
+    def get_tx_dataframe(self) -> DataFrame:
+        return self.spark.table(self.TX_TABLE)
+
+# end class Master
+
+
+def merge_lists_distinct(*lists: List[str]) -> List[str]:
+    """Return the union of the given lists, without duplicates."""
+    accum = set()
+    for lst in lists:
+        accum = accum.union(set(lst))
+    return list(accum)
+
+
+def check_lists_overlap(list1, list2) -> bool:
+    """True if the two lists share at least one element."""
+    return any(x in list1 for x in list2)
+
+
+def cluster_step(clusters: List[List[str]], addresses: List[List[str]]) -> List[List[str]]:
+    # if there are no more sets of addresses to consider, we are done
+    if len(addresses) == 0:
+        return clusters
+
+    # take one set of addresses and remove it from the list of candidates
+    tx = addresses[0]
+    addresses = addresses[1:]
+
+    # find the clusters that overlap with these addresses ...
+    matching_clusters = [cluster for cluster in clusters if check_lists_overlap(tx, cluster)]
+
+    # ... remove them from the result ...
+    clusters = [cluster for cluster in clusters if not check_lists_overlap(tx, cluster)]
+
+    # ... and replace them with a single cluster: the union of the matching
+    # clusters and the inspected set of addresses
+    clusters.append(merge_lists_distinct(tx, *matching_clusters))
+
+    return cluster_step(clusters, addresses)
+
+
+def cluster_id_addresses_rows(rows: Iterable[Row]) -> Iterable:
+    # cluster all address sets of one partition into a single list of clusters
+    address_lists = [row['addresses'] for row in rows]
+    yield cluster_step([], address_lists)
+
+
+def dud(rows):
+    # unused variant of cluster_id_addresses_rows: yields the raw address lists without clustering them
+    address_lists = [row['addresses'] for row in rows]
+    yield address_lists
+
+
+master = Master(config)
+master.spark.catalog.clearCache()
+master.spark.sparkContext.setCheckpointDir(config['spark_checkpoint_dir'])
+tx_df = master.get_tx_dataframe()
+
+# turn transactions into a list of ('tx_id', [addr, addr, ...])
+tx_grouped = tx_df \
+    .groupBy('tx_id') \
+    .agg(F.collect_set('address').alias('addresses')) \
+    .orderBy('tx_id')
+
+print()
+
+# cluster each partition independently, then fold the per-partition results
+# together with the same merge step
+res = tx_grouped \
+    .repartition(5) \
+    .rdd \
+    .mapPartitions(cluster_id_addresses_rows) \
+    .fold([], cluster_step)
+
+for cluster in res:
+    print(sorted(cluster))
+
+end = time.time()
+print("ELAPSED TIME:", end - start)
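
For a quick local sanity check of the merge logic, the clustering helpers can be exercised in plain Python without a Spark session (the sample address lists below are made up, not taken from the patch):

    # with cluster_step and its helpers from the file above in scope
    clusters = cluster_step([], [['a', 'b'], ['c', 'd'], ['b', 'c'], ['x', 'y']])
    # ['a', 'b'], ['c', 'd'] and ['b', 'c'] overlap transitively, so they collapse
    # into a single cluster; sorting each result gives:
    #   ['a', 'b', 'c', 'd']
    #   ['x', 'y']
    for cluster in clusters:
        print(sorted(cluster))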