2 years ago · 57e70a1fed
--- a/scratchpad.py
+++ b/scratchpad.py
@@ -1,5 +1,7 @@
 
				
				+import time
			
 
				
				 import sys
			
 
				
				 import json
			
 
				
				+from typing import Dict
			
 
				
				 from cassandra.cluster import Cluster
			
 
				
				 
			
 
				
				 sys.path.append("config/db")
			
@@ -14,19 +16,14 @@ session = cluster.connect(config['cassandra_keyspace'])
 
				
				 print(f"Connection OK")
			
 
				
				 
			
 
				
				 result = session.execute("SELECT * FROM clusters")
			
 
				
				-print(result.all())
			
 
				
				 
			
 
				
				+map = dict()
			
 
				
				 
			
 
				
				-"""
			
 
				
				-sc = pyspark.SparkContext('spark://osboxes:7077')
			
 
				
				+for e in result.all():
			
 
				
				+    if(e[1] not in map):
			
 
				
				+        map[e[1]] = []
			
 
				
				+    
			
 
				
				+    map[e[1]].append(e[0])
			
 
				
				 
			
 
				
				-data = sc.parallelize(list("aaa bbb cc dd e f"))
			
 
				
				-counts = data \
			
 
				
				-    .map(lambda x: (x, 1)) \
			
 
				
				-    .reduceByKey(add) \
			
 
				
				-    .sortBy(lambda x: x[1], ascending=False) \
			
 
				
				-    .collect()
			
 
				
				-
			
 
				
				-for (word, count) in counts:
			
 
				
				-    print("{}: {}".format(word, count))
			
 
				
				-"""
			
 
				
				+for key in map:
			
 
				
				+    print(sorted(map[key]))
			
--- a/src/spark/main.py
+++ b/src/spark/main.py
@@ -4,11 +4,13 @@ import json
 
				
				 from sqlite3 import Row
			
 
				
				 from typing import Iterable, List
			
 
				
				 
			
 
				
				-from pyspark import RDD
			
 
				
				-
			
 
				
				 from pyspark.sql import SparkSession, DataFrame, Row
			
 
				
				 from pyspark.sql import functions as F
			
 
				
				 
			
 
				
				+import time
			
 
				
				+start = time.time()
			
 
				
				+
			
 
				
				+
			
 
				
				 config = json.load(open("./settings.json"))
			
 
				
				 debug = config['debug']
			
 
				
				 
			
@@ -63,18 +65,22 @@ class Master:
 
				
				             .toDF(["tx_group", "index"])
			
 
				
				 
			
 
				
				     def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
			
 
				
				-        sqlstr = f"""
			
 
				
				-            UPDATE {self.CLUSTERS_TABLE} 
			
 
				
				-            SET parent='{new_cluster_root}' 
			
 
				
				-            WHERE parent IN ({','.join(map(lambda r: f"'{r}'", cluster_roots))})"""
			
 
				
				+        cluster_rewrite = self.spark \
			
 
				
				+            .table(self.CLUSTERS_TABLE) \
			
 
				
				+            .where(F.col('parent').isin(cluster_roots)) \
			
 
				
				+            .select('address') \
			
 
				
				+            .rdd \
			
 
				
				+            .map(lambda addr: (addr['address'], new_cluster_root)) \
			
 
				
				+            .toDF(['address', 'parent']) \
			
 
				
				         
			
 
				
				         if(debug):
			
 
				
				-            print("UPDATE SQL")
			
 
				
				-            print(sqlstr)
			
 
				
				+            print("REWRITE JOB")
			
 
				
				+            cluster_rewrite.show(truncate=False, vertical=True)
			
 
				
				             print()
			
 
				
				 
			
 
				
				-        self.spark.sql(sqlstr)
			
 
				
				-
			
 
				
				+        cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
			
 
				
				+        
			
 
				
				+        
			
 
				
				 # end class Master
			
 
				
				 
			
 
				
				 
			
@@ -104,7 +110,7 @@ for i in range(0, tx_addr_groups.count()):
 
				
				 
			
 
				
				     if(debug):
			
 
				
				         print("KNOWN CLUSTERS")
			
 
				
				-        cluster_addr_groups.show(truncate=False)
			
 
				
				+        cluster_addr_groups.show(truncate=True)
			
 
				
				         print()
			
 
				
				 
			
 
				
				     tx_addrs: Iterable[str] = tx_groups_indexed \
			
@@ -129,7 +135,7 @@ for i in range(0, tx_addr_groups.count()):
 
				
				         print("cluster_tx_mapping")
			
 
				
				         cluster_tx_mapping \
			
 
				
				             .toDF(['cluster', 'tx']) \
			
 
				
				-            .show(truncate=False)
			
 
				
				+            .show(truncate=True)
			
 
				
				         print()
			
 
				
				 
			
 
				
				 
			
@@ -145,7 +151,7 @@ for i in range(0, tx_addr_groups.count()):
 
				
				 
			
 
				
				 
			
 
				
				     if(len(matched_roots) == 0):
			
 
				
				-        new_root = master.insertNewCluster(tx_addrs)
			
 
				
				+        master.insertNewCluster(tx_addrs)
			
 
				
				     elif(len(matched_roots) == 1):
			
 
				
				         master.insertNewCluster(tx_addrs, matched_roots[0])
			
 
				
				     else:
			
@@ -153,4 +159,7 @@ for i in range(0, tx_addr_groups.count()):
 
				
				         master.insertNewCluster(tx_addrs, matched_roots[0])
			
 
				
				 
			
 
				
				     if(debug):
			
 
				
				-        print("==============")
			
 
				
				+        print("======================================================================")
			
 
				
				+
			
 
				
				+end = time.time()
			
 
				
				+print("ELAPSED TIME:", end-start)