Quellcode durchsuchen

union find with partition clustering

master
nitowa vor 2 Jahren
Ursprung
Commit
1b1c134cdf
2 geänderte Dateien mit 9 neuen und 12 gelöschten Zeilen
  1. 0
    1
      src/spark/main.py
  2. 9
    11
      src/spark/main_partition.py

+ 0
- 1
src/spark/main.py Datei anzeigen

@@ -69,7 +69,6 @@ tx_grouped = tx_df \
69 69
     .groupBy('tx_id') \
70 70
     .agg(F.collect_set('address').alias('addresses'))
71 71
 
72
-tx_grouped.rdd.mapPartitions(cluster_id_addresses_rows)
73 72
 
74 73
 # TODO: Load clusters from the DB; if none exist, create the initial cluster, otherwise proceed with the loaded data
75 74
 

+ 9
- 11
src/spark/main_partition.py Datei anzeigen

@@ -48,21 +48,19 @@ def cluster_step(clusters: "List[List[str]]", addresses: "List[List[str]]"):
48 48
     if(len(addresses) == 0):
49 49
         return clusters
50 50
 
51
-    #take a set of addresses
52 51
     tx = addresses[0]
53
-    #remove it from list candidates
54
-    addresses = addresses[1:]
52
+    matching_clusters = []
53
+    new_clusters = []
55 54
 
56
-    #find clusters that match these addresses
57
-    matching_clusters = filter(lambda cluster: check_lists_overlap(tx, cluster), clusters)
58
-    
59
-    #remove all clusters that match these addresses
60
-    clusters = list(filter(lambda cluster: not check_lists_overlap(tx, cluster), clusters))
55
+    for cluster in clusters:
56
+        if(check_lists_overlap(tx, cluster)):
57
+            matching_clusters.append(cluster)
58
+        else:
59
+            new_clusters.append(cluster)
61 60
 
62
-    #add a new cluster that is the union of found clusters and the inspected list of addresses
63
-    clusters.append(merge_lists_distinct(tx, *matching_clusters))
61
+    new_clusters.append(merge_lists_distinct(tx, *matching_clusters))
64 62
 
65
-    return cluster_step(clusters,addresses)
63
+    return cluster_step(new_clusters,addresses[1:])
66 64
 
67 65
 def cluster_partition(iter: "Iterable[Row]") -> Iterable:
68 66
     yield cluster_step([], list(map(lambda row: row['addresses'], iter)))

Laden…
Abbrechen
Speichern