
union find with partition clustering

master
nitowa 1 year ago
parent
commit
9c1ac98ebf
3 changed files with 124 additions and 0 deletions
  1. src/spark/main.py (+7, -0)
  2. src/spark/main_partition.py (+98, -0)
  3. submit_partition.sh (+19, -0)

src/spark/main.py (+7, -0)

@@ -1,4 +1,5 @@
 import json
+from typing import Iterable
 
 from pyspark.sql import SparkSession, DataFrame, Row
 from pyspark.sql import functions as F
@@ -54,6 +55,10 @@ class Master:
 # end class Master
 
 
+def cluster_id_addresses_rows(iter: "Iterable[Row]") -> Iterable:
+    return iter
+
+
 master = Master(config)
 master.spark.catalog.clearCache()
 master.spark.sparkContext.setCheckpointDir(config['spark_checkpoint_dir'])
@@ -64,6 +69,8 @@ tx_grouped = tx_df \
     .groupBy('tx_id') \
     .agg(F.collect_set('address').alias('addresses'))
 
+tx_grouped.rdd.mapPartitions(cluster_id_addresses_rows)
+
 # TODO: Load clusters from DB, check if any exist, if no make initial cluster, else proceed with loaded data
 
 # find initial cluster
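For context, mapPartitions hands its callback one iterator of Row objects per partition, so the stub above simply passes its input through. A minimal local sketch of that contract (illustrative data and a local SparkSession, not part of this commit):

from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.master("local[2]").appName("mapPartitions-demo").getOrCreate()

# Rows shaped like the output of the groupBy/collect_set above (hypothetical values).
rows = [Row(tx_id=1, addresses=["a", "b"]), Row(tx_id=2, addresses=["b", "c"])]
df = spark.createDataFrame(rows)

def passthrough(it):
    # Same shape as cluster_id_addresses_rows: receives an iterator of Rows, returns an iterable.
    return it

print(df.rdd.mapPartitions(passthrough).collect())
spark.stop()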

src/spark/main_partition.py (+98, -0)

@@ -0,0 +1,98 @@
+import json
+from typing import Iterable, List, Set
+
+from pyspark.sql import SparkSession, DataFrame, Row
+from pyspark.sql import functions as F
+
+import time
+start = time.time()
+
+
+config = json.load(open("./settings.json"))
+debug = config['debug']
+
+
+class Master:
+    spark: SparkSession
+    CLUSTERS_TABLE: str
+    TX_TABLE: str
+
+    def __init__(self, config):
+        self.spark = self.makeSparkContext(config)
+        self.config = config
+        self.CLUSTERS_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"
+        self.TX_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['tx_table_name']}"
+
+    def makeSparkContext(self, config) -> SparkSession:
+        return SparkSession.builder \
+            .appName('SparkCassandraApp') \
+            .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
+            .getOrCreate()
+
+    def get_tx_dataframe(self) -> DataFrame:
+        return self.spark.table(self.TX_TABLE)
+
+# end class Master
+
+def merge_lists_distinct(*lists: "Iterable[List[str]]") -> List[str]:
+    accum = set()
+    for lst in lists:
+        accum = accum.union(set(lst))
+    return list(accum)
+
+def check_lists_overlap(list1, list2):
+    return any(x in list1 for x in list2)
+
+def cluster_step(clusters: "List[List[str]]", addresses: "List[List[str]]"):
+    # if there are no more sets of addresses to consider, we are done
+    if len(addresses) == 0:
+        return clusters
+
+    # take a set of addresses
+    tx = addresses[0]
+    # remove it from the list of candidates
+    addresses = addresses[1:]
+
+    # find clusters that overlap with these addresses
+    matching_clusters = filter(lambda cluster: check_lists_overlap(tx, cluster), clusters)
+
+    # remove all clusters that overlap with these addresses
+    clusters = list(filter(lambda cluster: not check_lists_overlap(tx, cluster), clusters))
+
+    # add a new cluster that is the union of the matching clusters and the inspected set of addresses
+    clusters.append(merge_lists_distinct(tx, *matching_clusters))
+
+    return cluster_step(clusters, addresses)
+
+
+def cluster_id_addresses_rows(iter: "Iterable[Row]") -> Iterable:
+    address_lists = list(map(lambda row: row['addresses'], iter))
+    yield cluster_step([], address_lists)
+
+def dud(iter):
+    address_lists = list(map(lambda row: row['addresses'], iter))
+    yield address_lists
+
+master = Master(config)
+master.spark.catalog.clearCache()
+master.spark.sparkContext.setCheckpointDir(config['spark_checkpoint_dir'])
+tx_df = master.get_tx_dataframe()
+
+# Turn transactions into a list of ('id', [addr, addr, ...])
+tx_grouped = tx_df \
+    .groupBy('tx_id') \
+    .agg(F.collect_set('address').alias('addresses')) \
+    .orderBy('tx_id')
+
+print()
+res = tx_grouped \
+    .repartition(5) \
+    .rdd \
+    .mapPartitions(cluster_id_addresses_rows) \
+    .fold([], cluster_step)
+
+for cluster in res:
+    print(sorted(cluster))
+
+end = time.time()
+print("ELAPSED TIME:", end - start)

submit_partition.sh (+19, -0)

@@ -0,0 +1,19 @@
+SPARK_HOME=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_home"])')
+MEMORY=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_worker_memory"])')
+SPARK_MASTER=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_master"])')
+CASSANDRA_HOST=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(",".join(config["cassandra_addresses"]))')
+CASSANDRA_PORT=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["cassandra_port"])')
+CASSANDRA_OUT_CONSISTENCY=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["cassandra_output_consistency"])')
+EVENT_LOGGING=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_event_logging"])')
+
+"$SPARK_HOME"/bin/spark-submit \
+--master "$SPARK_MASTER" \
+--conf spark.executor.memory="$MEMORY" \
+--conf spark.cassandra.connection.host="$CASSANDRA_HOST" \
+--conf spark.cassandra.connection.port="$CASSANDRA_PORT" \
+--conf spark.cassandra.output.consistency.level="$CASSANDRA_OUT_CONSISTENCY" \
+--conf spark.eventLog.enabled="$EVENT_LOGGING" \
+--conf spark.sql.session.timeZone=UTC \
+--conf spark.sql.extensions=com.datastax.spark.connector.CassandraSparkExtensions \
+--packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0 \
+./src/spark/main_partition.py
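Both main_partition.py and submit_partition.sh read their configuration from ./settings.json. A hypothetical example covering the keys they reference (all values are illustrative, not taken from the commit):

{
  "debug": false,
  "spark_home": "/opt/spark",
  "spark_master": "spark://localhost:7077",
  "spark_worker_memory": "4g",
  "spark_event_logging": "false",
  "spark_checkpoint_dir": "/tmp/spark_checkpoint",
  "cassandra_addresses": ["127.0.0.1"],
  "cassandra_port": 9042,
  "cassandra_output_consistency": "LOCAL_ONE",
  "cassandra_catalog": "mycatalog",
  "cassandra_keyspace": "mykeyspace",
  "clusters_table_name": "clusters",
  "tx_table_name": "transactions"
}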
