
UPDATE TABLE is temporarily not supported XD

master
nitowa · 1 year ago
commit 8c4b598043
4 changed files with 167 additions and 49 deletions
  1. settings.json (+5, -1)
  2. src/spark/main.py (+158, -47)
  3. start_services.sh (+3, -0)
  4. submit.sh (+1, -1)

settings.json (+5, -1)

@@ -2,12 +2,16 @@
     "cassandra_addresses": ["127.0.0.1"],
     "cassandra_port": 9042,
     "cassandra_keyspace": "distributedunionfind",
+    "cassandra_catalog": "DUFCatalog",

     "setup_db_dir": "config/db",
     "setup_tables_dir": "config/db/tables",
     "setup_keyspace_dir": "config/db/keyspace",

     "tx_table_name": "transactions",
-    "clusters_table_name": "clusters"
+    "clusters_table_name": "clusters",

+    "spark_master": "spark://osboxes:7077",
+
+    "debug": true
 }
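The new cassandra_catalog key names the Spark SQL catalog that the Cassandra connector is registered under, so tables can be referenced as <catalog>.<keyspace>.<table>; spark_master and debug are read by the new Master class in src/spark/main.py below. A minimal sketch of how these keys are consumed, mirroring the session setup in the next file:

import json
from pyspark.sql import SparkSession

config = json.load(open("./settings.json"))

# Register the Cassandra connector as a Spark SQL catalog under the configured
# name, so tables resolve as <catalog>.<keyspace>.<table>.
spark = SparkSession.builder \
    .appName('SparkCassandraApp') \
    .config('spark.cassandra.connection.host', ','.join(config['cassandra_addresses'])) \
    .config(f"spark.sql.catalog.{config['cassandra_catalog']}",
            "com.datastax.spark.connector.datasource.CassandraCatalog") \
    .master(config['spark_master']) \
    .getOrCreate()

clusters_table = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"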

src/spark/main.py (+158, -47)

@@ -1,44 +1,89 @@
 from gc import collect
-from sqlite3 import Row
-from typing import Iterable
-from operator import add
-
-from pyspark.sql import SparkSession
-from pyspark.sql import functions as F
-
+import json

+from sqlite3 import Row
+from typing import Iterable, List

-spark = SparkSession.builder \
-    .appName('SparkCassandraApp') \
-    .config('spark.cassandra.connection.host', 'localhost') \
-    .config('spark.cassandra.connection.port', '9042') \
-    .config('spark.cassandra.output.consistency.level', 'ONE') \
-    .config("spark.sql.extensions",  "com.datastax.spark.connector.CassandraSparkExtensions") \
-    .config('directJoinSetting', 'on') \
-    .master('spark://osboxes:7077') \
-    .getOrCreate()
-
-spark.conf.set("spark.sql.catalog.myCatalog",
-               "com.datastax.spark.connector.datasource.CassandraCatalog")
-
+from pyspark import RDD

-tx_addr_groups = spark.read.table("myCatalog.distributedunionfind.transactions") \
-    .groupBy("tx_id") \
-    .agg(F.collect_set('address').alias('addresses')) \
-    .toLocalIterator()
+from pyspark.sql import SparkSession, DataFrame, Row
+from pyspark.sql import functions as F

-def insertCluster (row):
-    addrs: Iterable[str] = row['addresses']
-    df = spark.createDataFrame(map(lambda addr: (addr, addrs[0]), addrs), schema=['address', 'parent'])
+config = json.load(open("./settings.json"))
+debug = config['debug']
+
+class Master:
+    spark: SparkSession
+    CLUSTERS_TABLE: str
+    TX_TABLE: str
+
+    def __init__(self, config):
+        self.spark = self.makeSparkContext(config)
+        self.config = config
+        self.CLUSTERS_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"
+        self.TX_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['tx_table_name']}"
+
+    def makeSparkContext(self,config) -> SparkSession:
+        return SparkSession.builder \
+        .appName('SparkCassandraApp') \
+        .config('spark.cassandra.connection.host', ','.join(config['cassandra_addresses'])) \
+        .config('spark.cassandra.connection.port', config["cassandra_port"]) \
+        .config('spark.cassandra.output.consistency.level', 'ONE') \
+        .config("spark.sql.extensions",  "com.datastax.spark.connector.CassandraSparkExtensions") \
+        .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
+        .config('directJoinSetting', 'on') \
+        .master(config['spark_master']) \
+        .getOrCreate()
+
+    def group_tx_addrs(self) -> DataFrame:
+        return self.spark \
+            .read \
+            .table(self.TX_TABLE) \
+            .groupBy("tx_id") \
+            .agg(F.collect_set('address').alias('addresses'))
+
+    def group_cluster_addrs(self) -> DataFrame:
+        return self.spark \
+            .read \
+            .table(self.CLUSTERS_TABLE) \
+            .groupBy("parent") \
+            .agg(F.collect_set('address').alias('addresses'))
+
+    def insertNewCluster (self, addrs: Iterable[str], root: str | None = None) -> str:
+        if(root == None):
+            root = addrs[0]
+        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'parent'])
+        df.writeTo(self.CLUSTERS_TABLE).append()
+        return root
+
+    def enumerate(self, data: DataFrame) -> DataFrame:
+        return data \
+            .rdd \
+            .zipWithIndex() \
+            .toDF(["tx_group", "index"])
+
+    def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
+        sqlstr = f"""
+            UPDATE {self.CLUSTERS_TABLE}
+            SET parent='{new_cluster_root}'
+            WHERE parent IN ({','.join(map(lambda r: f"'{r}'", cluster_roots))})"""
+
+        if(debug):
+            print("UPDATE SQL")
+            print(sqlstr)
+            print()
+
+        self.spark.sql(sqlstr)
+
+# end class Master

-    df.writeTo("myCatalog.distributedunionfind.clusters").overwrite()

 """
     tuple structure:
-        Row => Row(parent=addr, addresses=list[addr]
-        Iterable[str] => list[addr]
+        Row => Row(parent=addr, addresses=list[addr] | the cluster
+        Iterable[str] => list[addr] | the transaction addresses
 """
-def find(data: tuple[Row, Iterable[str]]):
+def find(data: tuple[Row, Iterable[str]]) -> str | None:
     cluster = data[0]
     tx = data[1]

@@ -49,28 +94,94 @@ def find(data: tuple[Row, Iterable[str]]):
     else:
         return None

-for addr_group in tx_addr_groups:
-    clusters_df = spark.read.table("myCatalog.distributedunionfind.clusters")
+def handleTx(tx_addr_group: Row):

-    clusters = clusters_df \
-        .groupBy("parent") \
-        .agg(F.collect_set('address').alias('addresses'))

-    if (clusters.count() == 0):
-        insertCluster(addr_group)
+    found_clusters: "RDD[str]" = clusters.rdd \
+        .map(lambda cluster: (cluster, tx_addr_group['addresses'])) \
+        .map(find) \
+        .filter(lambda x: x != None)
+
+
+    if(found_clusters.count() == 0):
+        insertNewCluster(tx_addr_group)
+        return
+
+    cluster_roots = found_clusters.collect()
+
+    cl = clusters \
+        .select('addresses') \
+        .where(
+            F.col('parent').isin(cluster_roots)
+        ) \
+        .agg(F.collect_set('addresses').alias('agg')) \
+        .select(F.flatten('agg').alias('addresses')) \
+        .select(F.explode('addresses')) \
+        .rdd \
+        .map(lambda addr: (addr, cluster_roots[0])) \
+        .toDF(['address', 'parent']) \
+        .show()
+        #.writeTo(CLUSTERS_TABLE) \
+        #.append()
+
+
+master = Master(config)
+
+tx_addr_groups = master.group_tx_addrs()
+tx_groups_indexed = master.enumerate(tx_addr_groups)
+
+for i in range(0, tx_addr_groups.count()):
+    cluster_addr_groups = master.group_cluster_addrs()
+
+    if(debug):
+        print("KNOWN CLUSTERS")
+        cluster_addr_groups.show(truncate=False)
+        print()
+
+    tx_addrs: Iterable[str] = tx_groups_indexed \
+        .where(tx_groups_indexed.index == i) \
+        .select('tx_group') \
+        .collect()[0]['tx_group']['addresses']
+
+    if(debug):
+        print("CURRENT TX")
+        print(tx_addrs)
+        print()
+
+    if (cluster_addr_groups.count() == 0):
+        master.insertNewCluster(tx_addrs)
         continue

-    df = clusters.rdd \
-        .map(lambda cluster: (cluster, addr_group['addresses'])) \
+    cluster_tx_mapping = cluster_addr_groups \
+        .rdd \
+        .map(lambda cluster: (cluster, tx_addrs))
+
+    if(debug):
+        print("cluster_tx_mapping")
+        cluster_tx_mapping \
+            .toDF(['cluster', 'tx']) \
+            .show(truncate=False)
+        print()
+
+
+    matched_roots: "List[str]" = cluster_tx_mapping \
         .map(find) \
-        .filter(lambda x: x != None) \
+        .filter(lambda root: root != None) \
         .collect()

-    if(len(df) == 0):
-        insertCluster(addr_group)
-        continue
+    if(debug):
+        print("FOUND ROOTS")
+        print(matched_roots)
+        print()
+

-    print(addr_group)
-    print(df)
+    if(len(matched_roots) == 0):
+        new_root = master.insertNewCluster(tx_addrs)
+    elif(len(matched_roots) == 1):
+        master.insertNewCluster(tx_addrs, matched_roots[0])
+    else:
+        master.rewrite_cluster_parent(matched_roots[1:], matched_roots[0])
+        master.insertNewCluster(tx_addrs, matched_roots[0])

-    break
+    if(debug):
+        print("==============")

start_services.sh (+3, -0)

@@ -1,6 +1,9 @@
 SPARK_HOME="/home/osboxes/Downloads/spark-3.2.2-bin-hadoop3.2"
 SPARK_MASTER="spark://osboxes:7077"

+echo "Starting spark master..."
 "$SPARK_HOME"/sbin/start-master.sh
+echo "Starting spark workers..."
 SPARK_WORKER_INSTANCES=5 "$SPARK_HOME"/sbin/start-worker.sh "$SPARK_MASTER"
+echo "Starting cassandra container..."
 docker run -d -p 9042:9042 cassandra

submit.sh (+1, -1)

@@ -1,5 +1,5 @@
 SPARK_HOME="/home/osboxes/Downloads/spark-3.2.2-bin-hadoop3.2"
-MEMORY="4g"
+MEMORY="1g"
 SPARK_MASTER="spark://osboxes:7077"
 CASSANDRA_HOST="localhost"
