working graph implementation and improved shell scripts

master · nitowa · 1 year ago · commit a614991ff0

.gitignore (+4, -0)

@@ -0,0 +1,4 @@
+__pycache__
+.vscode
+checkpoints
+spark-warehouse

.vscode/settings.json (+0, -5)

@@ -1,5 +0,0 @@
-{
-    "python.analysis.extraPaths": [
-        "./config/db"
-    ]
-}

README.md (+20, -0)

@@ -0,0 +1,20 @@
+# Project Description
+
+TODO
+
+# Installation
+
+## Prerequisites
+
+- Python3
+- Apache Spark 3.2 (https://spark.apache.org/downloads.html)
+- Cassandra (https://cassandra.apache.org/_/index.html; for local development the official Docker image is recommended: https://hub.docker.com/_/cassandra)
+
+For the graph implementation specifically, you need to install `graphframes` manually, since the official release is incompatible with Spark 3.x (a pull request is pending). A prebuilt copy is supplied in the `spark-packages` directory.
+- graphframes (https://github.com/eejbyfeldt/graphframes/tree/spark-3.3)
+
+## Setting up
+
+- Modify `settings.json` to reflect your setup. If you are running everything locally, you can use `start_services.sh` to start everything in one go.
+- Load the development database by running `python3 setup.py` from the project root.
+- Start the Spark workload by running either `submit.sh` (slower) or `submit_graph.sh` (faster).
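Editor's note: before running `setup.py`, it can help to verify that Cassandra is actually reachable. A minimal sketch, reusing the same `cassandra-driver` connection pattern as `scratchpad.py` (deleted below); it assumes the keyspace named in `settings.json` already exists:

```python
# check_cassandra.py -- illustrative connectivity check, not part of this commit.
import json

from cassandra.cluster import Cluster  # pip install cassandra-driver

config = json.load(open("./settings.json"))
cluster = Cluster(config["cassandra_addresses"], port=config["cassandra_port"])
# Fails with an exception if the nodes are unreachable or the keyspace is missing.
session = cluster.connect(config["cassandra_keyspace"])
print("Cassandra connection OK")
```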

__pycache__/settings.cpython-310.pyc (binary)


scratchpad.py (+0, -29)

@@ -1,29 +0,0 @@
-import time
-import sys
-import json
-from typing import Dict
-from cassandra.cluster import Cluster
-
-sys.path.append("config/db")
-
-config = json.load(open("./settings.json"))
-
-print(
-    f"Attempting Cassandra connection @ {config['cassandra_addresses']}:{config['cassandra_port']}")
-cluster = Cluster(config['cassandra_addresses'],
-                    port=config['cassandra_port'])
-session = cluster.connect(config['cassandra_keyspace'])
-print(f"Connection OK")
-
-result = session.execute("SELECT * FROM clusters")
-
-map = dict()
-
-for e in result.all():
-    if(e[1] not in map):
-        map[e[1]] = []
-
-    map[e[1]].append(e[0])
-
-for key in map:
-    print(sorted(map[key]))

settings.json (+5, -1)

@@ -3,6 +3,7 @@
     "cassandra_port": 9042,
     "cassandra_keyspace": "distributedunionfind",
     "cassandra_catalog": "DUFCatalog",
+    "cassandra_output_consistency": "ONE",
 
     "setup_db_dir": "config/db",
     "setup_tables_dir": "config/db/tables",
@@ -11,7 +12,10 @@
     "tx_table_name": "transactions",
     "clusters_table_name": "clusters",
 
+    "spark_home": "/home/osboxes/Downloads/spark-3.2.2-bin-hadoop3.2",
     "spark_master": "spark://osboxes:7077",
+    "spark_worker_memory": "1g",
+    "spark_event_logging": "true",
 
-    "debug": true
+    "debug": false
 }
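Editor's note: the new keys are consumed by the rewritten submit scripts further down. A sketch of how each key maps onto a `spark-submit` flag, assuming the `settings.json` shape shown above; illustrative only:

```python
# Mirrors the --conf flags assembled by submit.sh / submit_graph.sh below.
import json

config = json.load(open("./settings.json"))
submit_conf = {
    "spark.executor.memory": config["spark_worker_memory"],
    "spark.cassandra.connection.host": ",".join(config["cassandra_addresses"]),
    "spark.cassandra.connection.port": config["cassandra_port"],
    "spark.cassandra.output.consistency.level": config["cassandra_output_consistency"],
    "spark.eventLog.enabled": config["spark_event_logging"],
}
for key, value in submit_conf.items():
    print(f"--conf {key}={value}")
```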

spark-packages/graphframe_3.3.jar (binary)


src/spark/main.py (+2, -8)

@@ -28,13 +28,7 @@ class Master:
     def makeSparkContext(self,config) -> SparkSession:
         return SparkSession.builder \
         .appName('SparkCassandraApp') \
-        .config('spark.cassandra.connection.host', ','.join(config['cassandra_addresses'])) \
-        .config('spark.cassandra.connection.port', config["cassandra_port"]) \
-        .config('spark.cassandra.output.consistency.level', 'ONE') \
-        .config("spark.sql.extensions",  "com.datastax.spark.connector.CassandraSparkExtensions") \
         .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
-        .config('directJoinSetting', 'on') \
-        .master(config['spark_master']) \
         .getOrCreate()
 
     def group_tx_addrs(self) -> DataFrame:
@@ -103,7 +97,7 @@ def find(data: tuple[Row, Iterable[str]]) -> str | None:
 master = Master(config)
 
 tx_addr_groups = master.group_tx_addrs()
-tx_groups_indexed = master.enumerate(tx_addr_groups)
+tx_groups_indexed = master.enumerate(tx_addr_groups).cache()
 
 for i in range(0, tx_addr_groups.count()):
     cluster_addr_groups = master.group_cluster_addrs()
@@ -129,7 +123,7 @@ for i in range(0, tx_addr_groups.count()):
 
     cluster_tx_mapping = cluster_addr_groups \
         .rdd \
-        .map(lambda cluster: (cluster, tx_addrs)) 
+        .map(lambda cluster: (cluster, tx_addrs))
 
     if(debug):
         print("cluster_tx_mapping")

src/spark/main_graphs.py (+80, -0)

@@ -0,0 +1,80 @@
+from typing import List
+from graphframes import GraphFrame
+import json
+from pyspark.sql import SparkSession, DataFrame, Row
+from pyspark.sql import functions as F
+
+import time
+start = time.time()
+
+
+config = json.load(open("./settings.json"))
+debug = config['debug']
+
+
+class Master:
+    spark: SparkSession
+    CLUSTERS_TABLE: str
+    TX_TABLE: str
+
+    def __init__(self, config):
+        self.spark = self.makeSparkContext(config)
+        self.config = config
+        self.CLUSTERS_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['clusters_table_name']}"
+        self.TX_TABLE = f"{config['cassandra_catalog']}.{config['cassandra_keyspace']}.{config['tx_table_name']}"
+
+    def makeSparkContext(self, config) -> SparkSession:
+        return SparkSession.builder \
+            .appName('DistributedUnionFindWithGraphs') \
+            .config(f"spark.sql.catalog.{config['cassandra_catalog']}", "com.datastax.spark.connector.datasource.CassandraCatalog") \
+            .getOrCreate()
+
+    def empty_dataframe(self, schema) -> DataFrame:
+        return self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema)
+
+    def get_tx_dataframe(self) -> DataFrame:
+        return self.spark.table(self.TX_TABLE)
+
+    def get_cluster_dataframe(self) -> DataFrame:
+        return self.spark.table(self.CLUSTERS_TABLE)
+
+# end class Master
+
+
+master = Master(config)
+master.spark.sparkContext.setCheckpointDir(
+    './checkpoints')  # spark is really adamant it needs this
+
+# Vertex DataFrame
+transaction_as_vertices = master.get_tx_dataframe() \
+    .select('address') \
+    .withColumnRenamed('address', 'id') \
+    .distinct()
+
+def explode_row(row: Row) -> List[Row]:
+    addresses = row['addresses']
+    if(len(addresses) == 1):
+        return []
+
+    return list(map(lambda addr: (addr, addresses[0]), addresses[1:]))
+
+
+tx_groups = master.get_tx_dataframe() \
+    .groupBy("tx_id") \
+    .agg(F.collect_set('address').alias('addresses'))
+
+transactions_as_edges = tx_groups \
+    .rdd \
+    .flatMap(explode_row) \
+    .toDF(['src', 'dst'])
+
+
+# Create a GraphFrame
+g = GraphFrame(transaction_as_vertices, transactions_as_edges)
+res = g.connectedComponents().groupBy('component').agg(F.collect_list('id')).collect()
+
+for row in res:
+    print(sorted(row['collect_list(id)']))
+
+end = time.time()
+print("ELAPSED TIME:", end-start)

start_services.sh (+6, -3)

@@ -1,9 +1,12 @@
-SPARK_HOME="/home/osboxes/Downloads/spark-3.2.2-bin-hadoop3.2"
-SPARK_MASTER="spark://osboxes:7077"
+SPARK_HOME=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_home"])')
+SPARK_MASTER=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_master"])')
 
 echo "Starting spark master..."
 "$SPARK_HOME"/sbin/start-master.sh
 echo "Starting spark workers..."
 SPARK_WORKER_INSTANCES=5 "$SPARK_HOME"/sbin/start-worker.sh "$SPARK_MASTER"
 echo "Starting cassandra container..."
-docker run -d -p 9042:9042 cassandra
+docker run -d -p 9042:9042 cassandra
+echo "Starting spark history server..."
+mkdir -p /tmp/spark-events
+"$SPARK_HOME"/sbin/start-history-server.sh

submit.sh (+11, -6)

@@ -1,14 +1,19 @@
-SPARK_HOME="/home/osboxes/Downloads/spark-3.2.2-bin-hadoop3.2"
-MEMORY="1g"
-SPARK_MASTER="spark://osboxes:7077"
-CASSANDRA_HOST="localhost"
-
+SPARK_HOME=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_home"])')
+MEMORY=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_worker_memory"])')
+SPARK_MASTER=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_master"])')
+CASSANDRA_HOST=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(",".join(config["cassandra_addresses"]))')
+CASSANDRA_PORT=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["cassandra_port"])')
+CASSANDRA_OUT_CONSISTENCY=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["cassandra_output_consistency"])')
+EVENT_LOGGING=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_event_logging"])')
 
 "$SPARK_HOME"/bin/spark-submit \
 --master "$SPARK_MASTER" \
 --conf spark.executor.memory="$MEMORY" \
 --conf spark.cassandra.connection.host="$CASSANDRA_HOST" \
+--conf spark.cassandra.connection.port="$CASSANDRA_PORT" \
+--conf spark.cassandra.output.consistency.level="$CASSANDRA_OUT_CONSISTENCY" \
+--conf spark.eventLog.enabled="$EVENT_LOGGING" \
 --conf spark.sql.session.timeZone=UTC \
 --conf spark.sql.extensions=com.datastax.spark.connector.CassandraSparkExtensions \
 --packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0 \
-./src/spark/main.py
+./src/spark/main.py

submit_graph.sh (+20, -0)

@@ -0,0 +1,20 @@
+SPARK_HOME=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_home"])')
+MEMORY=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_worker_memory"])')
+SPARK_MASTER=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_master"])')
+CASSANDRA_HOST=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(",".join(config["cassandra_addresses"]))')
+CASSANDRA_PORT=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["cassandra_port"])')
+CASSANDRA_OUT_CONSISTENCY=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["cassandra_output_consistency"])')
+EVENT_LOGGING=$(python3 -c 'import json,sys;config=json.load(open("./settings.json"));print(config["spark_event_logging"])')
+
+"$SPARK_HOME"/bin/spark-submit \
+--master "$SPARK_MASTER" \
+--conf spark.executor.memory="$MEMORY" \
+--conf spark.cassandra.connection.host="$CASSANDRA_HOST" \
+--conf spark.cassandra.connection.port="$CASSANDRA_PORT" \
+--conf spark.cassandra.output.consistency.level="$CASSANDRA_OUT_CONSISTENCY" \
+--conf spark.eventLog.enabled="$EVENT_LOGGING" \
+--conf spark.sql.session.timeZone=UTC \
+--conf spark.sql.extensions=com.datastax.spark.connector.CassandraSparkExtensions \
+--packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0 \
+--jars ./spark-packages/graphframe_3.3.jar \
+./src/spark/main_graphs.py
