
working but very slow

master · nitowa · 2 years ago · commit 57e70a1fed
2 changed files with 33 additions and 27 deletions:
  1. scratchpad.py (+10 −13)
  2. src/spark/main.py (+23 −14)

scratchpad.py (+10 −13)

@@ -1,5 +1,7 @@
+import time
 import sys
 import json
+from typing import Dict
 from cassandra.cluster import Cluster
 
 sys.path.append("config/db")
@@ -14,19 +16,14 @@
 print(f"Connection OK")
 
 result = session.execute("SELECT * FROM clusters")
-print(result.all())
 
+map = dict()
 
-"""
-sc = pyspark.SparkContext('spark://osboxes:7077')
+for e in result.all():
+    if(e[1] not in map):
+        map[e[1]] = []
+
+    map[e[1]].append(e[0])
 
-data = sc.parallelize(list("aaa bbb cc dd e f"))
-counts = data \
-    .map(lambda x: (x, 1)) \
-    .reduceByKey(add) \
-    .sortBy(lambda x: x[1], ascending=False) \
-    .collect()
-
-for (word, count) in counts:
-    print("{}: {}".format(word, count))
-"""
+for key in map:
+    print(sorted(map[key]))
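Side note on the grouping loop above: it shadows the builtin name map and hand-rolls what collections.defaultdict already provides. A minimal standalone sketch of the same grouping, assuming (as the loop implies) that each clusters row carries an address in column 0 and its parent in column 1; the connection setup is a placeholder, since the real contact points sit in the elided config/db lines:

import sys
from collections import defaultdict

from cassandra.cluster import Cluster

sys.path.append("config/db")

# Placeholder connection; the keyspace name is an assumption,
# the real values are not visible in this diff.
session = Cluster().connect("my_keyspace")
print("Connection OK")

result = session.execute("SELECT * FROM clusters")

# Group addresses (column 0) under their cluster parent (column 1);
# defaultdict removes the explicit membership check.
groups = defaultdict(list)
for row in result:
    groups[row[1]].append(row[0])

for parent in groups:
    print(sorted(groups[parent]))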

src/spark/main.py (+23 −14)

@@ -4,11 +4,13 @@
 from sqlite3 import Row
 from typing import Iterable, List
 
-from pyspark import RDD
-
 from pyspark.sql import SparkSession, DataFrame, Row
 from pyspark.sql import functions as F
 
+import time
+start = time.time()
+
+
 config = json.load(open("./settings.json"))
 debug = config['debug']
 
@@ -63,18 +65,22 @@
             .toDF(["tx_group", "index"])
 
     def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
-        sqlstr = f"""
-            UPDATE {self.CLUSTERS_TABLE} 
-            SET parent='{new_cluster_root}' 
-            WHERE parent IN ({','.join(map(lambda r: f"'{r}'", cluster_roots))})"""
+        cluster_rewrite = self.spark \
+            .table(self.CLUSTERS_TABLE) \
+            .where(F.col('parent').isin(cluster_roots)) \
+            .select('address') \
+            .rdd \
+            .map(lambda addr: (addr['address'], new_cluster_root)) \
+            .toDF(['address', 'parent']) \
 
         if(debug):
-            print("UPDATE SQL")
-            print(sqlstr)
+            print("REWRITE JOB")
+            cluster_rewrite.show(truncate=False, vertical=True)
            print()
 
-        self.spark.sql(sqlstr)
-
+        cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()
+
+
 # end class Master
 
 
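The commit message says "working but very slow", and the hunk above is the likely hotspot: the rewrite now detours through .rdd and a Python lambda, so every matching row is serialized out to Python workers just to be paired with a constant (and the committed chain ends in a stray trailing backslash). A sketch of an equivalent pure-DataFrame body for rewrite_cluster_parent that keeps the work in the JVM; it assumes the same address/parent layout of CLUSTERS_TABLE and is untested against this codebase:

    def rewrite_cluster_parent(self, cluster_roots: Iterable[str], new_cluster_root: str) -> None:
        # Attach the new parent as a literal column instead of mapping
        # over an RDD row by row in Python.
        cluster_rewrite = self.spark \
            .table(self.CLUSTERS_TABLE) \
            .where(F.col('parent').isin(list(cluster_roots))) \
            .select('address') \
            .withColumn('parent', F.lit(new_cluster_root))

        if(debug):
            print("REWRITE JOB")
            cluster_rewrite.show(truncate=False, vertical=True)
            print()

        # Same write as the commit: this appends re-parented rows rather
        # than updating in place, so it only matches the old UPDATE's
        # behavior if the table upserts on its key (an assumption about
        # the schema).
        cluster_rewrite.writeTo(self.CLUSTERS_TABLE).append()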
@@ -104,7 +110,7 @@
 
     if(debug):
         print("KNOWN CLUSTERS")
-        cluster_addr_groups.show(truncate=False)
+        cluster_addr_groups.show(truncate=True)
         print()
 
     tx_addrs: Iterable[str] = tx_groups_indexed \
@@ -129,7 +135,7 @@
         print("cluster_tx_mapping")
         cluster_tx_mapping \
             .toDF(['cluster', 'tx']) \
-            .show(truncate=False)
+            .show(truncate=True)
         print()
 
 
@@ -145,7 +151,7 @@
 
 
     if(len(matched_roots) == 0):
-        new_root = master.insertNewCluster(tx_addrs)
+        master.insertNewCluster(tx_addrs)
     elif(len(matched_roots) == 1):
         master.insertNewCluster(tx_addrs, matched_roots[0])
     else:
@@ -153,4 +159,7 @@
         master.insertNewCluster(tx_addrs, matched_roots[0])
 
     if(debug):
-        print("==============")
+        print("======================================================================")
+
+end = time.time()
+print("ELAPSED TIME:", end-start)
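One caveat on the new timing probe: time.time() follows the wall clock and can jump if the system clock is adjusted mid-run, while time.perf_counter() is the usual monotonic choice for measuring elapsed runtime. A minimal sketch of the same harness:

import time

start = time.perf_counter()
# ... run the clustering job ...
elapsed = time.perf_counter() - start
print(f"ELAPSED TIME: {elapsed:.2f}s")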
