[529] Build xtable with scala version(s)

rangareddy authored and vinishjail97 committed Sep 26, 2024
1 parent 457ff33 commit 56ebbc9
Showing 13 changed files with 107 additions and 49 deletions.
Dockerfile (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ WORKDIR /build
COPY ./ ./
RUN --mount=type=cache,target=/root/.m2 \
MAVEN_OPTS=-Dorg.slf4j.simpleLogger.defaultLogLevel=warn mvn -B package -DskipTests
-RUN mv xtable-utilities/target/xtable-utilities-$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)-bundled.jar target/app.jar
+RUN mv xtable-utilities/target/xtable-utilities_2.12-$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)-bundled.jar target/app.jar

FROM eclipse-temurin:17-jre-jammy AS final

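A quick way to sanity-check the renamed artifact outside Docker, using the same `mvn help:evaluate` lookup the Dockerfile relies on (a sketch run from the repository root, not part of this commit):

```shell
# Resolve the project version exactly as the Dockerfile does
VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)

# The bundled jar now carries the Scala binary suffix (_2.12 by default)
ls xtable-utilities/target/xtable-utilities_2.12-${VERSION}-bundled.jar
```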
README.md (8 changes: 6 additions & 2 deletions)

@@ -38,6 +38,10 @@ future.
by something like `mvn test -Dtest=TestDeltaSync -pl xtable-core`.
4. Similarly, use `mvn clean verify` or `mvn verify` to run integration tests.

+**Note:** When using Maven version 3.9 or above, Maven automatically caches the build. To ignore build caching, you can
+add the `-Dmaven.build.cache.enabled=false` parameter. For example, `mvn clean package -DskipTests -Dmaven.build.cache.enabled=false`


# Style guide
1. We use [Maven Spotless plugin](https://github.com/diffplug/spotless/tree/main/plugin-maven) and
[Google java format](https://github.com/google/google-java-format) for code style.
@@ -46,7 +50,7 @@ future.

# Running the bundled jar
1. Get a pre-built bundled jar or create the jar with `mvn install -DskipTests`
-2. create a yaml file that follows the format below:
+2. Create a yaml file that follows the format below:
```yaml
sourceFormat: HUDI
targetFormats:
@@ -110,7 +114,7 @@ catalogOptions: # all other options are passed through in a map
key1: value1
key2: value2
```
-5. run with `java -jar xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml [--hadoopConfig hdfs-site.xml] [--convertersConfig converters.yaml] [--icebergCatalogConfig catalog.yaml]`
+5. Run with `java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml [--hadoopConfig hdfs-site.xml] [--convertersConfig converters.yaml] [--icebergCatalogConfig catalog.yaml]`
The bundled jar includes hadoop dependencies for AWS, Azure, and GCP. Sample hadoop configurations for configuring the converters
can be found in the [xtable-hadoop-defaults.xml](https://github.com/apache/incubator-xtable/blob/main/utilities/src/main/resources/xtable-hadoop-defaults.xml) file.
The custom hadoop configurations can be passed in with the `--hadoopConfig [custom-hadoop-config-file]` option.
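Because the bundled jar name now depends on the active Scala profile, scripts can resolve the `_2.12` suffix from the build instead of hard-coding it. A minimal sketch, assuming the `scala.binary.version` property this commit introduces in `pom.xml`:

```shell
SCALA_BINARY=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)
VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
java -jar "xtable-utilities/target/xtable-utilities_${SCALA_BINARY}-${VERSION}-bundled.jar" \
  --datasetConfig my_config.yaml
```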
demo/start_demo.sh (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@ mvn install -am -pl xtable-core -DskipTests -T 2
mkdir -p demo/jars
cp xtable-hudi-support/xtable-hudi-support-utils/target/xtable-hudi-support-utils-0.2.0-SNAPSHOT.jar demo/jars
cp xtable-api/target/xtable-api-0.2.0-SNAPSHOT.jar demo/jars
-cp xtable-core/target/xtable-core-0.2.0-SNAPSHOT.jar demo/jars
+cp xtable-core/target/xtable-core_2.12-0.2.0-SNAPSHOT.jar demo/jars

cd demo
docker-compose up
pom.xml (77 changes: 64 additions & 13 deletions)

@@ -48,12 +48,13 @@

<modules>
<module>xtable-api</module>
+<module>xtable-hudi-support</module>
<module>xtable-core</module>
<module>xtable-utilities</module>
-<module>xtable-hudi-support</module>
</modules>

<properties>
+<project.version>0.2.0-SNAPSHOT</project.version>
<maven.compiler.target>8</maven.compiler.target>
<avro.version>1.11.3</avro.version>
<log4j.version>2.22.0</log4j.version>
@@ -68,8 +69,10 @@
<maven-deploy-plugin.version>3.1.1</maven-deploy-plugin.version>
<maven-release-plugin.version>2.5.3</maven-release-plugin.version>
<parquet.version>1.12.2</parquet.version>
-<scala.version>2.12.15</scala.version>
-<scala.version.prefix>2.12</scala.version.prefix>
+<scala12.version>2.12.20</scala12.version>
+<scala13.version>2.13.14</scala13.version>
+<scala.version>${scala12.version}</scala.version>
+<scala.binary.version>2.12</scala.binary.version>
<spark.version>3.4.2</spark.version>
<spark.version.prefix>3.4</spark.version.prefix>
<iceberg.version>1.4.2</iceberg.version>
@@ -84,7 +87,8 @@
<delombok.output.dir>${project.build.directory}/delombok</delombok.output.dir>
<apache-jar-resource-bundle.version>1.7</apache-jar-resource-bundle.version>
<apache-incubator-disclaimer-resource-bundle.version>1.7</apache-incubator-disclaimer-resource-bundle.version>

+<scala-collection-compat.version>2.8.1</scala-collection-compat.version>

<!-- Test properties -->
<skipTests>false</skipTests>
<skipUTs>${skipTests}</skipUTs>
@@ -126,8 +130,8 @@
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
-<artifactId>scala-collection-compat_${scala.version.prefix}</artifactId>
-<version>2.8.1</version>
+<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
+<version>${scala-collection-compat.version}</version>
</dependency>

<!-- Avro -->
@@ -230,7 +234,7 @@
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
-<artifactId>hudi-spark${spark.version.prefix}-bundle_${scala.version.prefix}</artifactId>
+<artifactId>hudi-spark${spark.version.prefix}-bundle_${scala.binary.version}</artifactId>
<version>${hudi.version}</version>
<scope>test</scope>
</dependency>
@@ -266,28 +270,28 @@
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
-<artifactId>iceberg-spark-runtime-${spark.version.prefix}_${scala.version.prefix}</artifactId>
+<artifactId>iceberg-spark-runtime-${spark.version.prefix}_${scala.binary.version}</artifactId>
<version>${iceberg.version}</version>
<scope>test</scope>
</dependency>

<!-- Delta -->
<dependency>
<groupId>io.delta</groupId>
-<artifactId>delta-core_${scala.version.prefix}</artifactId>
+<artifactId>delta-core_${scala.binary.version}</artifactId>
<version>${delta.version}</version>
</dependency>
<dependency>
<groupId>io.delta</groupId>
-<artifactId>delta-standalone_${scala.version.prefix}</artifactId>
+<artifactId>delta-standalone_${scala.binary.version}</artifactId>
<version>${delta.standalone.version}</version>
<scope>test</scope>
</dependency>

<!-- Spark -->
<dependency>
<groupId>org.apache.spark</groupId>
-<artifactId>spark-core_${scala.version.prefix}</artifactId>
+<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<exclusions>
<exclusion>
@@ -307,7 +311,7 @@
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
-<artifactId>spark-sql_${scala.version.prefix}</artifactId>
+<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
@@ -465,7 +469,7 @@
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
-<artifactId>jackson-module-scala_${scala.version.prefix}</artifactId>
+<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
<version>${jackson.version}</version>
</dependency>

@@ -868,6 +872,53 @@
</repositories>

<profiles>
+<!-- Scala 2.12 profile -->
+<profile>
+  <id>scala-2.12</id>
+  <activation>
+    <activeByDefault>true</activeByDefault>
+  </activation>
+  <properties>
+    <scala.version>${scala12.version}</scala.version>
+    <scala.binary.version>2.12</scala.binary.version>
+  </properties>
+  <build>
+    <pluginManagement/>
+  </build>
+</profile>
+
+<!-- Scala 2.13 profile -->
+<!-- Once Hudi supports Scala 2.13, enable the following profile -->
+<profile>
+  <id>scala-2.13</id>
+  <activation>
+    <activeByDefault>false</activeByDefault>
+  </activation>
+  <properties>
+    <scala.version>${scala13.version}</scala.version>
+    <scala.binary.version>2.13</scala.binary.version>
+  </properties>
+  <build>
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>net.alchim31.maven</groupId>
+          <artifactId>scala-maven-plugin</artifactId>
+          <configuration>
+            <args>
+              <arg>-unchecked</arg>
+              <arg>-deprecation</arg>
+              <arg>-feature</arg>
+              <arg>-explaintypes</arg>
+              <arg>-target:jvm-1.8</arg>
+            </args>
+            <compilerPlugins/>
+          </configuration>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+</profile>
<profile>
<id>release</id>
<activation>
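With these profiles in place, the Scala line is selected at build time with Maven's `-P` flag, and `scala-2.12` remains the default. A sketch of the expected invocations (the 2.13 build stays disabled until Hudi ships Scala 2.13 artifacts, per the comment above):

```shell
# Default build: the scala-2.12 profile is active by default
mvn clean package -DskipTests

# Equivalent, selecting the profile explicitly
mvn clean package -DskipTests -Pscala-2.12

# Once Hudi publishes Scala 2.13 artifacts:
# mvn clean package -DskipTests -Pscala-2.13
```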
website/docs/biglake-metastore.md (4 changes: 2 additions & 2 deletions)

@@ -25,7 +25,7 @@ This document walks through the steps to register an Apache XTable™ (Incubatin
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service_account_key.json
```
5. Clone the Apache XTable™ (Incubating) [repository](https://github.com/apache/incubator-xtable) and create the
-`xtable-utilities-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
+`xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
6. Download the [BigLake Iceberg JAR](gs://spark-lib/biglake/biglake-catalog-iceberg1.2.0-0.1.0-with-dependencies.jar) locally.
Apache XTable™ (Incubating) requires the JAR to be present in the classpath.

@@ -117,7 +117,7 @@ catalogOptions:
From your terminal under the cloned Apache XTable™ (Incubating) directory, run the sync process using the below command.

```shell md title="shell"
-java -cp xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar:/path/to/downloaded/biglake-catalog-iceberg1.2.0-0.1.0-with-dependencies.jar org.apache.xtable.utilities.RunSync --datasetConfig my_config.yaml --icebergCatalogConfig catalog.yaml
+java -cp xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar:/path/to/downloaded/biglake-catalog-iceberg1.2.0-0.1.0-with-dependencies.jar org.apache.xtable.utilities.RunSync --datasetConfig my_config.yaml --icebergCatalogConfig catalog.yaml
```

:::tip Note:
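Step 6 above references the BigLake Iceberg JAR by its `gs://` URI; one way to fetch it, assuming the Google Cloud SDK's `gsutil` is installed and authenticated:

```shell
# Download the catalog JAR referenced in the prerequisites to the current directory
gsutil cp gs://spark-lib/biglake/biglake-catalog-iceberg1.2.0-0.1.0-with-dependencies.jar .
```

Note that the `:` classpath separator in the `java -cp` command above assumes a Unix-like shell; on Windows the separator is `;`.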
website/docs/fabric.md (2 changes: 1 addition & 1 deletion)

@@ -98,7 +98,7 @@ An example hadoop configuration for authenticating to ADLS storage account is as
```

```shell md title="shell"
-java -jar xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml --hadoopConfig hadoop.xml
+java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml --hadoopConfig hadoop.xml
```

Running the above command will translate the table `people` in Iceberg or Hudi format to Delta Lake format. To validate
website/docs/glue-catalog.md (4 changes: 2 additions & 2 deletions)

@@ -19,7 +19,7 @@ This document walks through the steps to register an Apache XTable™ (Incubatin
also set up access credentials by following the steps
[here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html)
3. Clone the Apache XTable™ (Incubating) [repository](https://github.com/apache/incubator-xtable) and create the
-`xtable-utilities-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
+`xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)

## Steps
### Running sync
@@ -84,7 +84,7 @@ Replace with appropriate values for `sourceFormat`, `tableBasePath` and `tableNa
From your terminal under the cloned xtable directory, run the sync process using the below command.

```shell md title="shell"
-java -jar xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
+java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
```

:::tip Note:
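Before running the sync, it can help to confirm that the AWS credentials set up in the doc's prerequisites are visible to the CLI; an optional sanity check:

```shell
# Prints the account and ARN the sync will run as; fails fast if credentials are missing
aws sts get-caller-identity
```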
website/docs/hms.md (4 changes: 2 additions & 2 deletions)

@@ -17,7 +17,7 @@ This document walks through the steps to register an Apache XTable™ (Incubatin
or a distributed system like Amazon EMR, Google Cloud's Dataproc, Azure HDInsight etc.
This is a required step to register the table in HMS using a Spark client.
3. Clone the XTable™ (Incubating) [repository](https://github.com/apache/incubator-xtable) and create the
-`xtable-utilities-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
+`xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
4. This guide also assumes that you have configured the Hive Metastore locally or on EMR/Dataproc/HDInsight
and is already running.

@@ -88,7 +88,7 @@ datasets:

From your terminal under the cloned Apache XTable™ (Incubating) directory, run the sync process using the below command.
```shell md title="shell"
-java -jar xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
+java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
```

:::tip Note:
website/docs/how-to.md (4 changes: 2 additions & 2 deletions)

@@ -24,7 +24,7 @@ history to enable proper point in time queries.
1. A compute instance where you can run Apache Spark. This can be your local machine, docker,
or a distributed service like Amazon EMR, Google Cloud's Dataproc, Azure HDInsight etc
2. Clone the Apache XTable™ (Incubating) [repository](https://github.com/apache/incubator-xtable) and create the
-`xtable-utilities-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
+`xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
3. Optional: Setup access to write to and/or read from distributed storage services like:
* Amazon S3 by following the steps
[here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) to install AWSCLIv2
@@ -351,7 +351,7 @@ Authentication for GCP requires service account credentials to be exported. i.e.
In your terminal under the cloned Apache XTable™ (Incubating) directory, run the below command.

```shell md title="shell"
-java -jar xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
+java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
```

**Optional:**
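The second hunk's context notes that GCP authentication requires exported service-account credentials; the expected shape, following the same pattern the BigLake doc uses (the key path is a placeholder):

```shell
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service_account_key.json
java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
```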
website/docs/unity-catalog.md (4 changes: 2 additions & 2 deletions)

@@ -17,7 +17,7 @@ This document walks through the steps to register an Apache XTable™ (Incubatin
3. Create a Unity Catalog metastore in Databricks as outlined [here](https://docs.gcp.databricks.com/data-governance/unity-catalog/create-metastore.html#create-a-unity-catalog-metastore).
4. Create an external location in Databricks as outlined [here](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-location.html).
5. Clone the Apache XTable™ (Incubating) [repository](https://github.com/apache/incubator-xtable) and create the
-`xtable-utilities-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)
+`xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar` by following the steps on the [Installation page](/docs/setup)

## Pre-requisites (for open-source Unity Catalog)
1. Source table(s) (Hudi/Iceberg) already written to external storage locations like S3/GCS/ADLS or local.
@@ -48,7 +48,7 @@ datasets:
From your terminal under the cloned Apache XTable™ (Incubating) directory, run the sync process using the below command.

```shell md title="shell"
-java -jar xtable-utilities/target/xtable-utilities-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
+java -jar xtable-utilities/target/xtable-utilities_2.12-0.2.0-SNAPSHOT-bundled.jar --datasetConfig my_config.yaml
```

:::tip Note:
xtable-core/pom.xml (18 changes: 10 additions & 8 deletions)

@@ -25,17 +25,19 @@
<version>0.2.0-SNAPSHOT</version>
</parent>

-<artifactId>xtable-core</artifactId>
+<artifactId>xtable-core_${scala.binary.version}</artifactId>
<name>XTable Project Core</name>

<dependencies>
<dependency>
<groupId>org.apache.xtable</groupId>
<artifactId>xtable-api</artifactId>
+<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.xtable</groupId>
<artifactId>xtable-hudi-support-utils</artifactId>
+<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
Expand All @@ -47,7 +49,7 @@
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
-<artifactId>jackson-module-scala_${scala.version.prefix}</artifactId>
+<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
@@ -69,7 +71,7 @@
<!-- Hudi dependencies -->
<dependency>
<groupId>org.apache.hudi</groupId>
-<artifactId>hudi-spark${spark.version.prefix}-bundle_${scala.version.prefix}</artifactId>
+<artifactId>hudi-spark${spark.version.prefix}-bundle_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
<dependency>
@@ -94,11 +96,11 @@
<!-- Delta dependencies -->
<dependency>
<groupId>io.delta</groupId>
-<artifactId>delta-core_${scala.version.prefix}</artifactId>
+<artifactId>delta-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>io.delta</groupId>
-<artifactId>delta-standalone_${scala.version.prefix}</artifactId>
+<artifactId>delta-standalone_${scala.binary.version}</artifactId>
</dependency>

<!-- Hadoop dependencies -->
@@ -120,16 +122,16 @@
<!-- Spark/Iceberg/Hudi dependencies for reading/writing tables -->
<dependency>
<groupId>org.apache.iceberg</groupId>
-<artifactId>iceberg-spark-runtime-${spark.version.prefix}_${scala.version.prefix}</artifactId>
+<artifactId>iceberg-spark-runtime-${spark.version.prefix}_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
-<artifactId>spark-core_${scala.version.prefix}</artifactId>
+<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
-<artifactId>spark-sql_${scala.version.prefix}</artifactId>
+<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>

<!-- Mockito -->
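Because the module's `artifactId` is now interpolated from `scala.binary.version`, the effective coordinates can be checked without a full build; a sketch, run from the repository root:

```shell
# Should print xtable-core_2.12 under the default scala-2.12 profile
mvn -q -pl xtable-core help:evaluate -Dexpression=project.artifactId -DforceStdout
```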