
add db write to graph impl

Branch: master
Author: nitowa, 1 year ago
Commit: d42f70d33c
5 changed files with 45 additions and 39 deletions
1. .gitignore (+2 -1)
2. README.md (+9 -5)
3. config/db/tables/clusters/CREATE.sql (+1 -1)
4. src/spark/main.py (+9 -11)
5. src/spark/main_graphs.py (+24 -21)

.gitignore (+2 -1)

 __pycache__
 .vscode
 checkpoints
-spark-warehouse
+spark-warehouse
+scratchpad.py

README.md (+9 -5)

 
 - Python3
 - Apache spark 3.2 (https://spark.apache.org/downloads.html)
-- Cassandra DB (https://cassandra.apache.org/_/index.html, locally the docker build is recommended: https://hub.docker.com/_/cassandra)
+- Cassandra DB (https://cassandra.apache.org/\_/index.html, locally the docker build is recommended: https://hub.docker.com/\_/cassandra)
 
-For the graph implementation specifically you need to install `graphframes` manually since the official release is incompatible with `spark 3.x` (pull request pending). A prebuilt copy is supplied in the `spark-packages` directory.
+For the graph implementation specifically you need to install `graphframes` manually from a third party since the official release is incompatible with `spark 3.x` ([pull request pending](https://github.com/graphframes/graphframes/pull/415)). A prebuilt copy is supplied in the `spark-packages` directory.
 - graphframes (https://github.com/eejbyfeldt/graphframes/tree/spark-3.3)
 
 ## Setting up
 
-- Modify `settings.json` to reflect your setup. If you are running everything locally you can use `start_services.sh` to turn everything on in one swoop.
-- Load the development database by running `python3 setup.py` from the project root.
-- Start the spark workload by either running `submit.sh` (slow) or `submit_graph.sh` (faster)
+- Modify `settings.json` to reflect your setup. If you are running everything locally you can use `start_services.sh` to turn everything on in one swoop. It might take a few minutes for Cassandra to become available.
+- Load the development database by running `python3 setup.py` from the project root. Per default this will move `small_test_data.csv` into the transactions table.
+
+# Deploying:
+
+- Start the spark workload by either running `submit.sh` (slow) or `submit_graph.sh` (faster)
+- If you need to clean out the Database you can run `python3 clean.py`. Be wary that this wipes all data.
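For orientation, both Spark entry points read their setup from `settings.json` and pass it around as `config`; the only key this commit actually references is `config['cassandra_catalog']` (see the `src/spark/main_graphs.py` diff below). A minimal sketch of loading it, assuming the file is plain JSON; any key other than `cassandra_catalog` below is a hypothetical placeholder, not something confirmed by the repository:

# Minimal sketch, assuming settings.json is plain JSON. Only `cassandra_catalog`
# is confirmed by this commit; the host key is a hypothetical placeholder.
import json

with open('settings.json') as f:
    config = json.load(f)

print(config['cassandra_catalog'])              # catalog name Spark uses to reach Cassandra
print(config.get('cassandra_host', 'not set'))  # hypothetical key, for illustration only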

config/db/tables/clusters/CREATE.sql (+1 -1)

 CREATE TABLE clusters(
     address TEXT,
-    parent TEXT,
+    id TEXT,
     PRIMARY KEY (address)
 );
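The rename from `parent` to `id` changes only the column name: the table still stores one row per address, with `id` holding the root address of the cluster that address belongs to (see `insertNewCluster` in the `src/spark/main.py` diff below). A small local sketch of the rows such an insert produces; the addresses are made up and the local SparkSession merely stands in for the Cassandra-backed catalog used by the real job:

# Local sketch of the rows insertNewCluster() writes after the rename.
# The SparkSession and addresses are illustrative only; the real job writes
# through the Cassandra catalog configured in settings.json.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("clusters-sketch").getOrCreate()

addrs = ["addr_a", "addr_b", "addr_c"]  # hypothetical addresses forming one cluster
root = addrs[0]                         # the first address doubles as the cluster id

df = spark.createDataFrame([(a, root) for a in addrs], schema=["address", "id"])
df.show()
# Rows produced (address -> cluster id):
#   addr_a -> addr_a
#   addr_b -> addr_a
#   addr_c -> addr_a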

src/spark/main.py (+9 -11)

         return self.spark \
             .read \
             .table(self.CLUSTERS_TABLE) \
-            .groupBy("parent") \
+            .groupBy("id") \
             .agg(F.collect_set('address').alias('addresses'))
 
     def insertNewCluster (self, addrs: Iterable[str], root: str | None = None) -> str:
         if(root == None):
             root = addrs[0]
-        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'parent'])
+        df = self.spark.createDataFrame(map(lambda addr: (addr, root), addrs), schema=['address', 'id'])
         df.writeTo(self.CLUSTERS_TABLE).append()
         return root
 
...
             .zipWithIndex() \
             .toDF(["tx_group", "index"])
 
-    def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
+    def rewrite_cluster_id(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
         cluster_rewrite = self.spark \
             .table(self.CLUSTERS_TABLE) \
-            .where(F.col('parent').isin(cluster_roots)) \
+            .where(F.col('id').isin(cluster_roots)) \
             .select('address') \
             .rdd \
             .map(lambda addr: (addr['address'], new_cluster_root)) \
-            .toDF(['address', 'parent']) \
+            .toDF(['address', 'id']) \
 
         if(debug):
             print("REWRITE JOB")
...
             print()
 
         cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
-
-
 # end class Master
 
 
 """
     tuple structure:
-        Row => Row(parent=addr, addresses=list[addr] | the cluster
+        Row => Row(id=addr, addresses=list[addr] | the cluster
         Iterable[str] => list[addr] | the transaction addresses
 """
 def find(data: tuple[Row, Iterable[str]]) -> str | None:
     cluster = data[0]
     tx = data[1]
 
-    clusteraddresses = cluster['addresses'] + [cluster['parent']]
+    clusteraddresses = cluster['addresses'] + [cluster['id']]
 
     if any(x in tx for x in clusteraddresses):
-        return cluster['parent']
+        return cluster['id']
     else:
         return None
 
...
     elif(len(matched_roots) == 1):
         master.insertNewCluster(tx_addrs, matched_roots[0])
     else:
-        master.rewrite_cluster_parent(matched_roots[1:], matched_roots[0])
+        master.rewrite_cluster_id(matched_roots[1:], matched_roots[0])
         master.insertNewCluster(tx_addrs, matched_roots[0])
 
     if(debug):
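To make the control flow of this file easier to follow: `find` returns the id of every existing cluster that shares an address with the incoming transaction, a single match simply extends that cluster, and multiple matches are first folded into one id via `rewrite_cluster_id` before the transaction's addresses are appended. Below is a simplified, in-memory sketch of that merge logic; all names and data are illustrative and not part of the repository, and the real job keeps this state in the Cassandra `clusters` table rather than a dict:

# Simplified in-memory sketch of the merge logic above (illustration only).
# `clusters` maps a cluster id (its root address) to the addresses it contains.
from typing import Dict, List, Set

def merge_transaction(clusters: Dict[str, Set[str]], tx_addrs: List[str]) -> str:
    # find(): every cluster whose address set intersects the transaction
    matched_roots = [cid for cid, addrs in clusters.items() if addrs.intersection(tx_addrs)]

    if not matched_roots:                # no overlap: the first tx address becomes a new cluster id
        root = tx_addrs[0]
    else:                                # overlap: keep the first matched id ...
        root = matched_roots[0]
        for other in matched_roots[1:]:  # ... and, like rewrite_cluster_id(), fold the rest into it
            clusters.setdefault(root, set()).update(clusters.pop(other))

    # insertNewCluster(): attach the transaction's addresses to the chosen id
    clusters.setdefault(root, set()).update(tx_addrs)
    return root

clusters: Dict[str, Set[str]] = {}
merge_transaction(clusters, ["a1", "a2"])  # new cluster rooted at "a1"
merge_transaction(clusters, ["a3", "a4"])  # new cluster rooted at "a3"
merge_transaction(clusters, ["a2", "a3"])  # merges both clusters under "a1"
print(clusters)                            # {'a1': {'a1', 'a2', 'a3', 'a4'}} (set order may vary)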

src/spark/main_graphs.py (+24 -21)

             .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
             .getOrCreate()
 
-    def empty_dataframe(self, schema) -> DataFrame:
-        return self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema)
-
     def get_tx_dataframe(self) -> DataFrame:
         return self.spark.table(self.TX_TABLE)
 
     def get_cluster_dataframe(self) -> DataFrame:
         return self.spark.table(self.CLUSTERS_TABLE)
 
-# end class Master
+    def write_connected_components_as_clusters(self, conn_comp: DataFrame) -> None:
+        conn_comp \
+            .withColumnRenamed('id', 'address') \
+            .withColumnRenamed('component', 'id') \
+            .writeTo(self.CLUSTERS_TABLE) \
+            .append()
 
+# end class Master
 
 master = Master(config)
-master.spark.sparkContext.setCheckpointDir(
-    './checkpoints')  # spark is really adamant it needs this
+master.spark.sparkContext.setCheckpointDir('./checkpoints')  # spark is really adamant it needs this even if the algorithm is set to the non-checkpointed version
 
-# Vertex DataFrame
-transaction_as_vertices = master.get_tx_dataframe() \
+tx_df = master.get_tx_dataframe()
+
+transaction_as_vertices =  tx_df \
     .select('address') \
     .withColumnRenamed('address', 'id') \
     .distinct()
 
 def explode_row(row: Row) -> List[Row]:
     addresses = row['addresses']
-    if(len(addresses) == 1):
-        return []
-
     return list(map(lambda addr: (addr, addresses[0]), addresses[1:]))
 
-
-tx_groups = master.get_tx_dataframe() \
+transactions_as_edges = tx_df \
     .groupBy("tx_id") \
-    .agg(F.collect_set('address').alias('addresses'))
-
-transactions_as_edges = tx_groups \
+    .agg(F.collect_set('address').alias('addresses')) \
     .rdd \
     .flatMap(explode_row) \
     .toDF(['src', 'dst'])
 
-
-# Create a GraphFrame
 g = GraphFrame(transaction_as_vertices, transactions_as_edges)
-res = g.connectedComponents().groupBy('component').agg(F.collect_list('id')).collect()
+components = g.connectedComponents(algorithm='graphframes')
+
+master.write_connected_components_as_clusters(components)
+
+if(debug):
+    clusters = components \
+        .groupBy('component') \
+        .agg(F.collect_list('id')) \
+        .collect()
 
-for row in res:
-    print(sorted(row['collect_list(id)']))
+    for cluster in clusters:
+        print(sorted(cluster['collect_list(id)']))
 
 end = time.time()
 print("ELAPSED TIME:", end-start)
