
union find with partition clustering

master
nitowa, 2 years ago
parent commit 0069a95487
1 changed file with 4 additions and 10 deletions

src/spark/main_partition.py  (+4, -10)

@@ -64,15 +64,9 @@
 
     return cluster_step(clusters,addresses)
 
-
-def cluster_id_addresses_rows(iter: "Iterable[Row]") -> Iterable:
-    address_lists = list(map(lambda row: row['addresses'], iter))
-    yield cluster_step([], address_lists)
+def cluster_partition(iter: "Iterable[Row]") -> Iterable:
+    yield cluster_step([], list(map(lambda row: row['addresses'], iter)))
 
-def dud(iter):
-    address_lists = list(map(lambda row: row['addresses'], iter))
-    yield address_lists
-
 master = Master(config)
 master.spark.catalog.clearCache()
 master.spark.sparkContext.setCheckpointDir(config['spark_checkpoint_dir'])
@@ -84,14 +78,14 @@
     .agg(F.collect_set('address').alias('addresses')) \
     .orderBy('tx_id') \
 
-print()
 res = tx_grouped \
     .repartition(5) \
     .rdd \
-    .mapPartitions(cluster_id_addresses_rows) \
+    .mapPartitions(cluster_partition) \
     .fold([], cluster_step)
 
 for cluster in res:
+    print()
     print(sorted(cluster))
 
 end = time.time()
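
The diff calls cluster_step but never shows it. Below is a minimal, Spark-free sketch of how the mapPartitions-plus-fold union-find clustering presumably fits together; the merge logic inside cluster_step, the plain-dict rows, and the functools.reduce driver are illustrative assumptions, not the repository's actual implementation.

from functools import reduce
from typing import Iterable, List, Set

def cluster_step(clusters: "List[Set[str]]",
                 address_lists: "Iterable[Iterable[str]]") -> "List[Set[str]]":
    # Assumed merge logic: fold each incoming address collection into the
    # running cluster list, absorbing every existing cluster that shares
    # at least one address (the union-find / connected-components step).
    for addresses in address_lists:
        merged = set(addresses)
        remaining = []
        for cluster in clusters:
            if merged & cluster:
                merged |= cluster          # overlapping cluster: absorb it
            else:
                remaining.append(cluster)  # disjoint cluster: keep as-is
        remaining.append(merged)
        clusters = remaining
    return clusters

def cluster_partition(rows: "Iterable[dict]") -> Iterable:
    # One partition is clustered locally and emitted as a single partial
    # result, mirroring .mapPartitions(cluster_partition) in the diff.
    yield cluster_step([], [row['addresses'] for row in rows])

# Local stand-in for the Spark pipeline: two "partitions" of grouped rows,
# then a fold over the partial results, mirroring .fold([], cluster_step).
part_1 = [{'addresses': ['a', 'b']}, {'addresses': ['b', 'c']}]
part_2 = [{'addresses': ['x', 'y']}, {'addresses': ['c', 'x']}]
partials = [next(cluster_partition(iter(p))) for p in (part_1, part_2)]
res = reduce(cluster_step, partials, [])
print(res)  # [{'a', 'b', 'c', 'x', 'y'}] -- clusters linked across partitions

The same cluster_step can serve both the per-partition pass and the fold because each partial result is itself just a list of address collections, so links that cross partition boundaries (here 'c' and 'x') are resolved in the final merge.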
