
add db write to graph impl

master
nitowa, 1 year ago
commit d42f70d33c
5 changed files with 45 additions and 39 deletions
  1. .gitignore (+2 / -1)
  2. README.md (+9 / -5)
  3. config/db/tables/clusters/CREATE.sql (+1 / -1)
  4. src/spark/main.py (+9 / -11)
  5. src/spark/main_graphs.py (+24 / -21)

.gitignore (+2 / -1)

@@ -1,4 +1,5 @@
 __pycache__
 .vscode
 checkpoints
-spark-warehouse
+spark-warehouse
+scratchpad.py

README.md (+9 / -5)

@@ -8,13 +8,17 @@ TODO
 
 - Python3
 - Apache spark 3.2 (https://spark.apache.org/downloads.html)
-- Cassandra DB (https://cassandra.apache.org/_/index.html, locally the docker build is recommended: https://hub.docker.com/_/cassandra)
+- Cassandra DB (https://cassandra.apache.org/\_/index.html, locally the docker build is recommended: https://hub.docker.com/\_/cassandra)
 
-For the graph implementation specifically you need to install `graphframes` manually since the official release is incompatible with `spark 3.x` (pull request pending). A prebuilt copy is supplied in the `spark-packages` directory. 
+For the graph implementation specifically you need to install `graphframes` manually from a third party since the official release is incompatible with `spark 3.x` ([pull request pending](https://github.com/graphframes/graphframes/pull/415)). A prebuilt copy is supplied in the `spark-packages` directory. 
 - graphframes (https://github.com/eejbyfeldt/graphframes/tree/spark-3.3)
 
 ## Setting up
 
-- Modify `settings.json` to reflect your setup. If you are running everything locally you can use `start_services.sh` to turn everything on in one swoop.
-- Load the development database by running `python3 setup.py` from the project root.
-- Start the spark workload by either running `submit.sh` (slow) or `submit_graph.sh` (faster)
+- Modify `settings.json` to reflect your setup. If you are running everything locally you can use `start_services.sh` to turn everything on in one swoop. It might take a few minutes for Cassandra to become available.
+- Load the development database by running `python3 setup.py` from the project root. Per default this will move `small_test_data.csv` into the transactions table.
+
+# Deploying:
+
+- Start the spark workload by either running `submit.sh` (slow) or `submit_graph.sh` (faster)
+- If you need to clean out the Database you can run `python3 clean.py`. Be wary that this wipes all data.

config/db/tables/clusters/CREATE.sql (+1 / -1)

@@ -1,5 +1,5 @@
 CREATE TABLE clusters(
     address TEXT,
-    parent TEXT,
+    id TEXT,
     PRIMARY KEY (address)
 );
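With the column renamed from `parent` to `id`, each row of `clusters` maps an address to the identifier of the cluster it belongs to: in `src/spark/main.py` below that identifier is simply the first address of the cluster, while the graph implementation writes the GraphFrames component id into the same column. A minimal sketch of the resulting shape, using an in-memory DataFrame with made-up addresses instead of the Cassandra-backed table:

# Sketch only: what the renamed clusters table holds. Addresses are made up.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()

rows = [
    ("addr_A", "addr_A"),   # cluster root maps to itself
    ("addr_B", "addr_A"),
    ("addr_C", "addr_A"),
    ("addr_D", "addr_D"),   # a second, single-address cluster
]
clusters = spark.createDataFrame(rows, schema=["address", "id"])

# Same aggregation main.py uses to recover whole clusters from the table.
clusters.groupBy("id") \
    .agg(F.collect_set("address").alias("addresses")) \
    .show(truncate=False)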

src/spark/main.py (+9 / -11)

@@ -42,13 +42,13 @@ class Master:
         return self.spark \
             .read \
             .table(self.CLUSTERS_TABLE) \
-            .groupBy("parent") \
+            .groupBy("id") \
             .agg(F.collect_set('address').alias('addresses'))
 
     def insertNewCluster (self, addrs: Iterable[str], root: str | None = None) -> str:
         if(root == None):
             root = addrs[0]
-        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'parent'])
+        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'id'])
         df.writeTo(self.CLUSTERS_TABLE).append()
         return root
 
@@ -58,14 +58,14 @@ class Master:
             .zipWithIndex() \
             .toDF(["tx_group", "index"])
 
-    def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
+    def rewrite_cluster_id(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
         cluster_rewrite = self.spark \
             .table(self.CLUSTERS_TABLE) \
-            .where(F.col('parent').isin(cluster_roots)) \
+            .where(F.col('id').isin(cluster_roots)) \
             .select('address') \
             .rdd \
             .map(lambda addr: (addr['address'], new_cluster_root)) \
-            .toDF(['address', 'parent']) \
+            .toDF(['address', 'id']) \
 
         if(debug):
             print("REWRITE JOB")
@@ -73,24 +73,22 @@ class Master:
             print()
 
         cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
-
-
 # end class Master
 
 
 """
     tuple structure:
-        Row => Row(parent=addr, addresses=list[addr] | the cluster
+        Row => Row(id=addr, addresses=list[addr] | the cluster
         Iterable[str] => list[addr] | the transaction addresses
 """
 def find(data: tuple[Row, Iterable[str]]) -> str | None:
     cluster = data[0]
     tx = data[1]
 
-    clusteraddresses = cluster['addresses'] + [cluster['parent']]
+    clusteraddresses = cluster['addresses'] + [cluster['id']]
 
     if any(x in tx for x in clusteraddresses):
-        return cluster['parent']
+        return cluster['id']
     else:
         return None
 
@@ -149,7 +147,7 @@ for i in range(0, tx_addr_groups.count()):
     elif(len(matched_roots) == 1):
         master.insertNewCluster(tx_addrs, matched_roots[0])
     else:
-        master.rewrite_cluster_parent(matched_roots[1:], matched_roots[0])
+        master.rewrite_cluster_id(matched_roots[1:], matched_roots[0])
         master.insertNewCluster(tx_addrs, matched_roots[0])
 
     if(debug):
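The renamed `rewrite_cluster_id` is the cluster-merge step: when a transaction's addresses match several existing clusters, every matched root except the first is rewritten to the first one, and the transaction's addresses are then inserted under that surviving root. A plain-Python sketch of that rule, without Spark or Cassandra and with made-up data (names are illustrative only):

# Sketch of the merge rule above, using a dict in place of the clusters table.
def rewrite_cluster_id(clusters: dict[str, str], old_roots: list[str], new_root: str) -> None:
    # Point every address that belonged to one of old_roots at new_root.
    for address, cluster_id in clusters.items():
        if cluster_id in old_roots:
            clusters[address] = new_root

def insert_new_cluster(clusters: dict[str, str], addrs: list[str], root: str | None = None) -> str:
    root = root or addrs[0]
    for addr in addrs:
        clusters[addr] = root
    return root

clusters = {"a": "a", "b": "a", "c": "c"}   # two clusters: {a, b} and {c}
matched_roots = ["a", "c"]                  # a new transaction touches both
rewrite_cluster_id(clusters, matched_roots[1:], matched_roots[0])
insert_new_cluster(clusters, ["a", "c", "d"], matched_roots[0])
print(clusters)   # {'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'}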

src/spark/main_graphs.py (+24 / -21)

@@ -29,52 +29,55 @@ class Master:
             .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
             .getOrCreate()
 
-    def empty_dataframe(self, schema) -> DataFrame:
-        return self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema)
-
     def get_tx_dataframe(self) -> DataFrame:
         return self.spark.table(self.TX_TABLE)
 
     def get_cluster_dataframe(self) -> DataFrame:
         return self.spark.table(self.CLUSTERS_TABLE)
 
-# end class Master
+    def write_connected_components_as_clusters(self, conn_comp: DataFrame) -> None:
+        conn_comp \
+            .withColumnRenamed('id', 'address') \
+            .withColumnRenamed('component', 'id') \
+            .writeTo(self.CLUSTERS_TABLE) \
+            .append()
 
+# end class Master
 
 master = Master(config)
-master.spark.sparkContext.setCheckpointDir(
-    './checkpoints')  # spark is really adamant it needs this
+master.spark.sparkContext.setCheckpointDir('./checkpoints')  # spark is really adamant it needs this even if the algorithm is set to the non-checkpointed version
 
-# Vertex DataFrame
-transaction_as_vertices = master.get_tx_dataframe() \
+tx_df = master.get_tx_dataframe()
+
+transaction_as_vertices =  tx_df \
     .select('address') \
     .withColumnRenamed('address', 'id') \
    .distinct()
 
 def explode_row(row: Row) -> List[Row]:
     addresses = row['addresses']
-    if(len(addresses) == 1):
-        return []
-
     return list(map(lambda addr: (addr, addresses[0]), addresses[1:]))
 
-
-tx_groups = master.get_tx_dataframe() \
+transactions_as_edges = tx_df \
     .groupBy("tx_id") \
-    .agg(F.collect_set('address').alias('addresses'))
-
-transactions_as_edges = tx_groups \
+    .agg(F.collect_set('address').alias('addresses')) \
     .rdd \
     .flatMap(explode_row) \
     .toDF(['src', 'dst'])
 
-
-# Create a GraphFrame
 g = GraphFrame(transaction_as_vertices, transactions_as_edges)
-res = g.connectedComponents().groupBy('component').agg(F.collect_list('id')).collect()
+components = g.connectedComponents(algorithm='graphframes')
+
+master.write_connected_components_as_clusters(components)
+
+if(debug):
+    clusters = components \
+        .groupBy('component') \
+        .agg(F.collect_list('id')) \
+        .collect()
 
-for row in res:
-    print(sorted(row['collect_list(id)']))
+    for cluster in clusters:
+        print(sorted(cluster['collect_list(id)']))
 
 end = time.time()
 print("ELAPSED TIME:", end-start)
