|
@@ -42,13 +42,13 @@ class Master:
|
42
|
42
|
return self.spark \
|
43
|
43
|
.read \
|
44
|
44
|
.table(self.CLUSTERS_TABLE) \
|
45
|
|
- .groupBy("parent") \
|
|
45
|
+ .groupBy("id") \
|
46
|
46
|
.agg(F.collect_set('address').alias('addresses'))
|
47
|
47
|
|
48
|
48
|
def insertNewCluster (self, addrs: Iterable[str], root: str | None = None) -> str:
|
49
|
49
|
if(root == None):
|
50
|
50
|
root = addrs[0]
|
51
|
|
- df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'parent'])
|
|
51
|
+ df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'id'])
|
52
|
52
|
df.writeTo(self.CLUSTERS_TABLE).append()
|
53
|
53
|
return root
|
54
|
54
|
|
|
@@ -58,14 +58,14 @@ class Master:
|
58
|
58
|
.zipWithIndex() \
|
59
|
59
|
.toDF(["tx_group", "index"])
|
60
|
60
|
|
61
|
|
- def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
|
|
61
|
+ def rewrite_cluster_id(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
|
62
|
62
|
cluster_rewrite = self.spark \
|
63
|
63
|
.table(self.CLUSTERS_TABLE) \
|
64
|
|
- .where(F.col('parent').isin(cluster_roots)) \
|
|
64
|
+ .where(F.col('id').isin(cluster_roots)) \
|
65
|
65
|
.select('address') \
|
66
|
66
|
.rdd \
|
67
|
67
|
.map(lambda addr: (addr['address'], new_cluster_root)) \
|
68
|
|
- .toDF(['address', 'parent']) \
|
|
68
|
+ .toDF(['address', 'id']) \
|
69
|
69
|
|
70
|
70
|
if(debug):
|
71
|
71
|
print("REWRITE JOB")
|
|
@@ -73,24 +73,22 @@ class Master:
|
73
|
73
|
print()
|
74
|
74
|
|
75
|
75
|
cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
|
76
|
|
-
|
77
|
|
-
|
78
|
76
|
# end class Master
|
79
|
77
|
|
80
|
78
|
|
81
|
79
|
"""
|
82
|
80
|
tuple structure:
|
83
|
|
- Row => Row(parent=addr, addresses=list[addr] | the cluster
|
|
81
|
+ Row => Row(id=addr, addresses=list[addr] | the cluster
|
84
|
82
|
Iterable[str] => list[addr] | the transaction addresses
|
85
|
83
|
"""
|
86
|
84
|
def find(data: tuple[Row, Iterable[str]]) -> str | None:
|
87
|
85
|
cluster = data[0]
|
88
|
86
|
tx = data[1]
|
89
|
87
|
|
90
|
|
- clusteraddresses = cluster['addresses'] + [cluster['parent']]
|
|
88
|
+ clusteraddresses = cluster['addresses'] + [cluster['id']]
|
91
|
89
|
|
92
|
90
|
if any(x in tx for x in clusteraddresses):
|
93
|
|
- return cluster['parent']
|
|
91
|
+ return cluster['id']
|
94
|
92
|
else:
|
95
|
93
|
return None
|
96
|
94
|
|
|
@@ -149,7 +147,7 @@ for i in range(0, tx_addr_groups.count()):
|
149
|
147
|
elif(len(matched_roots) == 1):
|
150
|
148
|
master.insertNewCluster(tx_addrs, matched_roots[0])
|
151
|
149
|
else:
|
152
|
|
- master.rewrite_cluster_parent(matched_roots[1:], matched_roots[0])
|
|
150
|
+ master.rewrite_cluster_id(matched_roots[1:], matched_roots[0])
|
153
|
151
|
master.insertNewCluster(tx_addrs, matched_roots[0])
|
154
|
152
|
|
155
|
153
|
if(debug):
|