
union find with partition clustering

nitowa committed 1 year ago (branch: master, commit 1b1c134cdf)
2 changed files with 9 additions and 12 deletions
  1. src/spark/main.py (+0 -1)
  2. src/spark/main_partition.py (+9 -11)

src/spark/main.py (+0 -1)

@@ -69,7 +69,6 @@
     .groupBy('tx_id') \
     .agg(F.collect_set('address').alias('addresses'))
 
-tx_grouped.rdd.mapPartitions(cluster_id_addresses_rows)
 
 # TODO: Load clusters from DB, check if any exist, if no make initial cluster, else proceed with loaded data
 

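For context, a minimal runnable sketch of the grouping stage this hunk keeps, assuming an input DataFrame of (tx_id, address) pairs. The sample rows and SparkSession setup are illustrative, and wiring the result into cluster_partition from main_partition.py is an assumption based on the removed mapPartitions call:

# Sketch only: sample data and session setup are not from the commit.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('clustering-sketch').getOrCreate()

tx = spark.createDataFrame(
    [('t1', 'addr_a'), ('t1', 'addr_b'), ('t2', 'addr_b'), ('t3', 'addr_c')],
    ['tx_id', 'address'],
)

# The chain kept by the diff: one row per transaction holding the
# distinct set of addresses that transaction touches.
tx_grouped = tx.groupBy('tx_id') \
    .agg(F.collect_set('address').alias('addresses'))

# The old cluster_id_addresses_rows hook was deleted above; presumably the
# new cluster_partition from main_partition.py is meant to take its place:
# clusters = tx_grouped.rdd.mapPartitions(cluster_partition).collect()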
src/spark/main_partition.py (+9 -11)

@@ -48,21 +48,19 @@
     if(len(addresses) == 0):
         return clusters
 
-    #take a set of addresses
     tx = addresses[0]
-    #remove it from list candidates
-    addresses = addresses[1:]
+    matching_clusters = []
+    new_clusters = []
 
-    #find clusters that match these addresses
-    matching_clusters = filter(lambda cluster: check_lists_overlap(tx, cluster), clusters)
-    
-    #remove all clusters that match these addresses
-    clusters = list(filter(lambda cluster: not check_lists_overlap(tx, cluster), clusters))
+    for cluster in clusters:
+        if(check_lists_overlap(tx, cluster)):
+            matching_clusters.append(cluster)
+        else:
+            new_clusters.append(cluster)
 
-    #add a new cluster that is the union of found clusters and the inspected list of addresses
-    clusters.append(merge_lists_distinct(tx, *matching_clusters))
+    new_clusters.append(merge_lists_distinct(tx, *matching_clusters))
 
-    return cluster_step(clusters,addresses)
+    return cluster_step(new_clusters,addresses[1:])
 
 def cluster_partition(iter: "Iterable[Row]") -> Iterable:
     yield cluster_step([], list(map(lambda row: row['addresses'], iter)))
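
The rewrite replaces the two filter() passes over clusters with a single loop that splits the existing clusters into overlapping and non-overlapping groups, then collapses all overlapping clusters together with the current address set, union-find style. A self-contained sketch of the resulting function; check_lists_overlap and merge_lists_distinct are named in the diff but their bodies are not shown, so the helpers below are assumptions:

from typing import List

def check_lists_overlap(a: List[str], b: List[str]) -> bool:
    # Assumed helper: True if the two address lists share any element.
    return not set(a).isdisjoint(b)

def merge_lists_distinct(*lists: "List[str]") -> List[str]:
    # Assumed helper: union of all given lists, without duplicates.
    merged = set()
    for lst in lists:
        merged.update(lst)
    return list(merged)

def cluster_step(clusters: List[List[str]], addresses: List[List[str]]) -> List[List[str]]:
    if(len(addresses) == 0):
        return clusters

    tx = addresses[0]
    matching_clusters = []
    new_clusters = []

    # Single pass: split existing clusters into those overlapping tx
    # and those left untouched (previously two separate filter() calls).
    for cluster in clusters:
        if(check_lists_overlap(tx, cluster)):
            matching_clusters.append(cluster)
        else:
            new_clusters.append(cluster)

    # Merge tx with every overlapping cluster into one new cluster.
    new_clusters.append(merge_lists_distinct(tx, *matching_clusters))

    return cluster_step(new_clusters, addresses[1:])

# Example: the first two address sets share 'addr_b', so they merge.
print(cluster_step([], [['addr_a', 'addr_b'], ['addr_b', 'addr_c'], ['addr_d']]))
# -> e.g. [['addr_a', 'addr_b', 'addr_c'], ['addr_d']] (element order may vary)

Since cluster_step recurses once per grouped transaction, a partition with very many rows could hit Python's default recursion limit; an equivalent iterative loop over addresses would behave the same without that risk.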
