From 74684c31056ff1f7c33d58b39ac8a4a6d2177867 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Tue, 9 Jul 2019 16:11:24 +0200 Subject: [PATCH 1/4] [FEATURE] #15 and some doc clarifications --- README.md | 8 ++++++++ src/main/scala/Main.scala | 14 ++++++++++---- src/main/scala/io/cli/CommandLineOptions.scala | 5 +++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 299aa29..f82b5a9 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,14 @@ A query can be submitted to spark via: /opt/spark-apps/scark-cli-1.0.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t Consequence -q "SELECT id FROM Consequence" ``` +## Loading multiple tables for JOIN operations +To load multiple use the ```-t``` option with an alias: +``` +-t "(select * from Variant INNER JOIN Variant_has_Consequence ON Variant.id = Variant_has_Consequence.Variant_id) as t" +``` + +This will load all required tables and allow for queries. + ## Tests Run tests inside the sbt console from the root project directory using: ```bash diff --git a/src/main/scala/Main.scala b/src/main/scala/Main.scala index 2e06e1a..aa48c9a 100644 --- a/src/main/scala/Main.scala +++ b/src/main/scala/Main.scala @@ -72,16 +72,22 @@ object Main { connectionProperties.put("password", s"${databaseProperties.password}") connectionProperties.put("driver", s"${commandLineParameters.databaseDriver}") - val table = spark.read.jdbc(databaseProperties.jdbcURL, commandLineParameters.table, connectionProperties) - table.printSchema() - table.show() + val tables = commandLineParameters.table + + val dfs = for { + table <- tables + } yield (table, spark.read.jdbc(databaseProperties.jdbcURL, table, connectionProperties)) // NOTE // Spark requires a View of a table to allow for SQL queries // CreateOrReplaceTempView will create a temporary view of the table in memory. // It is not persistent at this moment but you can run sql queries on top of that. 
// If you want to save it you can either persist or use saveAsTable to save. - table.createOrReplaceTempView(commandLineParameters.table) + for { + (name, df) <- dfs + } df.createOrReplaceTempView(name) + + for (tuple <- dfs) tuple._2.printSchema() val result = spark.sql(commandLineParameters.sqlQuery) result.show() diff --git a/src/main/scala/io/cli/CommandLineOptions.scala b/src/main/scala/io/cli/CommandLineOptions.scala index e3523b2..4ec5f01 100644 --- a/src/main/scala/io/cli/CommandLineOptions.scala +++ b/src/main/scala/io/cli/CommandLineOptions.scala @@ -24,8 +24,9 @@ class CommandLineOptions { var configFilePath = "" @Option(names = Array("-t", "--table"), - description = Array("Table to run query on. Required if using Spark.")) - var table = "" + description = Array("Table to run query on. Required if using Spark."), + arity = "1..*") + var table = Array[String]() @Option(names = Array("-q", "--query"), description = Array("SQL query to execute."), From 0960d45dc311524b0e1b893e0d6a3b06dc372ee3 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Tue, 9 Jul 2019 16:11:28 +0200 Subject: [PATCH 2/4] [FEATURE] #15 and some doc clarifications --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f82b5a9..02f14c6 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Benchmark Tool for evaluating the performance of a Spark Cluster. Run custom SQL Queries inside Spark! -s, --spark run with spark support -l, --local run spark in local mode - requires -s option to be in effect - -t, --table[=] table to execute SQL query in, mandatory if running with spark support + -t, --table[=] table to execute SQL query in, mandatory if running with spark support -d, --driver[=] driver to access Database, e.g. 
org.mariadb.jdbc.Driver, mandatory if running with spark support -q, --query[=] SQL query to execute -c, --config[=] @@ -54,10 +54,10 @@ A query can be submitted to spark via: /opt/spark-apps/scark-cli-1.0.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t Consequence -q "SELECT id FROM Consequence" ``` -## Loading multiple tables for JOIN operations -To load multiple use the ```-t``` option with an alias: +## Multiple tables +Multiple views of tables can be created by specifying several tables with the ```-t``` option: ``` --t "(select * from Variant INNER JOIN Variant_has_Consequence ON Variant.id = Variant_has_Consequence.Variant_id) as t" +-t [option_1] [option_2] ... ``` This will load all required tables and allow for queries. From 08991e7ef12e0beeecdbd0ea381095302e9a1be9 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Tue, 16 Jul 2019 13:02:43 +0200 Subject: [PATCH 3/4] [README] Version badges and 1.1 release --- README.md | 28 +++++++++++++++++++++------- build.sbt | 2 +- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 02f14c6..c2efff2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ -[![Build Status](https://travis-ci.com/qbicsoftware/spark-benchmark-cli.svg?branch=development)](https://travis-ci.com/qbicsoftware/spark-benchmark-cli) +[![Build Status](https://travis-ci.com/qbicsoftware/scark-cli.svg?branch=development)](https://travis-ci.com/qbicsoftware/scark-cli) +![GitHub release](https://img.shields.io/github/release/qbicsoftware/scark-cli.svg) +![GitHub commits since latest release](https://img.shields.io/github/commits-since/qbicsoftware/scark-cli/latest.svg) [![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-brightgreen.svg?style=flat&logo=)](https://scala-steward.org) + # spark-benchmark-cli A tool for submitting SQL queries to a Spark Cluster. Various benchmarking statistics will be calculated. Currently MariaDB is supported out of the box. 
@@ -14,7 +17,7 @@ will build the fat jar. The result will be written to ```/target/$scala-version/ ## Running ```bash -java -jar scark-cli-1.0.0.jar +java -jar scark-cli-1.1.0.jar ``` ## Usage @@ -25,7 +28,7 @@ Benchmark Tool for evaluating the performance of a Spark Cluster. Run custom SQL Queries inside Spark! -s, --spark run with spark support -l, --local run spark in local mode - requires -s option to be in effect - -t, --table[=] table to execute SQL query in, mandatory if running with spark support + -t, --table[=] list of tables to execute SQL query in, mandatory if running with spark support -d, --driver[=] driver to access Database, e.g. org.mariadb.jdbc.Driver, mandatory if running with spark support -q, --query[=] SQL query to execute -c, --config[=] @@ -45,21 +48,32 @@ You can either use ```-q``` to get a prompt for your query or supply a full quer A query can be submitted to spark via: ```bash /spark/bin/spark-submit --master spark://spark-master:7077 \ -/opt/spark-apps/scark-cli-1.0.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t
<table> -q <"query"> +/opt/spark-apps/scark-cli-1.1.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t <table>
-q <"query"> ``` ## Example Query ```bash /spark/bin/spark-submit --master spark://spark-master:7077 \ -/opt/spark-apps/scark-cli-1.0.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t Consequence -q "SELECT id FROM Consequence" +/opt/spark-apps/scark-cli-1.1.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t Consequence -q "SELECT id FROM Consequence" ``` -## Multiple tables +## Multiple Tables Multiple views of tables can be created by specifying several tables with the ```-t``` option: ``` -t [option_1] [option_2] ... ``` +## Complex Query +``` +/spark/bin/spark-submit --master spark://spark-master:7077 \ +/opt/spark-apps/scark-cli-1.1.0.jar \ +-c /opt/spark-data/database_properties.txt \ +-s \ +-t Consequence Variant Variant_has_Consequence \ +-q "select * from Variant INNER JOIN Variant_has_Consequence ON Variant.id = Variant_has_Consequence.Variant_id INNER JOIN Consequence on Variant_has_Consequence.Consequence_id = Consequence.id" \ +-d org.mariadb.jdbc.Driver +``` + This will load all required tables and allow for queries. ## Tests @@ -68,6 +82,6 @@ Run tests inside the sbt console from the root project directory using: test ``` -## Known issues +## Known Issues Due to a bug in the MariaDB connector and Spark, mariadb in the jdbc URL has to be replaced with mysql. Please refer to: https://github.com/qbicsoftware/spark-benchmark-cli/issues/9 . 
diff --git a/build.sbt b/build.sbt index bcdba56..a5ad514 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "scark-cli" -version := "1.0.0" +version := "1.1.0" scalaVersion := "2.12.8" From 5f98c84f8a92ed1b61c94d3d14e98f709b4677a3 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Tue, 16 Jul 2019 13:07:02 +0200 Subject: [PATCH 4/4] [README] Cleaned up --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index c2efff2..0af2837 100644 --- a/README.md +++ b/README.md @@ -57,12 +57,6 @@ A query can be submitted to spark via: /opt/spark-apps/scark-cli-1.1.0.jar -s -d org.mariadb.jdbc.Driver -c /opt/spark-data/database_properties.txt -t Consequence -q "SELECT id FROM Consequence" ``` -## Multiple Tables -Multiple views of tables can be created by specifying several tables with the ```-t``` option: -``` --t [option_1] [option_2] ... -``` - ## Complex Query ``` /spark/bin/spark-submit --master spark://spark-master:7077 \ @@ -74,8 +68,6 @@ Multiple views of tables can be created by specifying several tables with the `` -d org.mariadb.jdbc.Driver ``` -This will load all required tables and allow for queries. - ## Tests Run tests inside the sbt console from the root project directory using: ```bash