diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 7ca483e434..9598feada9 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -12,3 +12,6 @@ bfe19393b17710f92b61ea80e6e5f4e026ea2676 # Scala Steward: Reformat with scalafmt 3.7.3 8a5eb7068f8d25459e6080ededa252c4000baad9 + +# Scala Steward: Reformat with scalafmt 3.7.5 +5413512ebb1e49de9bbedd32b6964c84e08366a4 diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 415273aa30..1e8538f7d6 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -10,7 +10,7 @@ jobs: checks: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: cache SBT uses: coursier/cache-action@v6 - name: Java 11 setup @@ -24,7 +24,7 @@ jobs: scalafix-rules: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: cache SBT uses: coursier/cache-action@v6 - name: Java 11 setup @@ -37,7 +37,7 @@ jobs: dependencies: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: cache SBT uses: coursier/cache-action@v6 - name: Java 11 setup @@ -49,7 +49,7 @@ jobs: mimaReport: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: cache SBT diff --git a/.github/workflows/dependency-graph.yml b/.github/workflows/dependency-graph.yml index 3cbc85a4ec..f3e894869b 100644 --- a/.github/workflows/dependency-graph.yml +++ b/.github/workflows/dependency-graph.yml @@ -10,5 +10,5 @@ jobs: name: Update Dependency Graph runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: scalacenter/sbt-dependency-submission@v2 diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index bba56b3022..fcb794ebcd 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -11,7 +11,7 @@ jobs: publish-repl: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: cache SBT uses: coursier/cache-action@v6 - name: Java 11 setup @@ -30,7 +30,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout tag - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCP_CREDENTIALS }} @@ -49,7 +49,7 @@ jobs: SOCCO: true _JAVA_OPTIONS: "-Xmx1500m" - name: Deploy - uses: JamesIves/github-pages-deploy-action@v4.4.2 + uses: JamesIves/github-pages-deploy-action@v4.4.3 with: token: ${{ secrets.GITHUB_TOKEN }} branch: gh-pages diff --git a/.github/workflows/it-tests.yml b/.github/workflows/it-tests.yml index 692430362a..681548b0f1 100644 --- a/.github/workflows/it-tests.yml +++ b/.github/workflows/it-tests.yml @@ -12,7 +12,7 @@ jobs: it-test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCP_CREDENTIALS }} @@ -32,7 +32,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout current branch - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCP_CREDENTIALS }} diff --git a/.github/workflows/populate-it-data.yml b/.github/workflows/populate-it-data.yml index 0a92ba7218..7b3b8605b0 100644 --- a/.github/workflows/populate-it-data.yml +++ b/.github/workflows/populate-it-data.yml @@ -9,7 +9,7 @@ jobs: populate-integration-test-data: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 
+ - uses: actions/checkout@v4 - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCP_CREDENTIALS }} diff --git a/.github/workflows/pre-release-check.yml b/.github/workflows/pre-release-check.yml index 69ef5da5ea..7123c98407 100644 --- a/.github/workflows/pre-release-check.yml +++ b/.github/workflows/pre-release-check.yml @@ -10,7 +10,7 @@ jobs: test-dataflow: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCP_CREDENTIALS }} diff --git a/.github/workflows/publish-gh-site.yml b/.github/workflows/publish-gh-site.yml index f0c9ae654e..e04c8a8d1d 100644 --- a/.github/workflows/publish-gh-site.yml +++ b/.github/workflows/publish-gh-site.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout tag - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCP_CREDENTIALS }} @@ -30,7 +30,7 @@ jobs: SOCCO: true _JAVA_OPTIONS: "-Xmx1500m" - name: Deploy - uses: JamesIves/github-pages-deploy-action@v4.4.2 + uses: JamesIves/github-pages-deploy-action@v4.4.3 with: token: ${{ secrets.GITHUB_TOKEN }} branch: gh-pages diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b731b1137d..90ee6e46d5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -7,7 +7,7 @@ jobs: publish: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: cache SBT diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c2506c6b97..d93fb71768 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,7 +10,7 @@ jobs: test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: cache SBT uses: coursier/cache-action@v6 - name: Java ${{matrix.java}} setup @@ -42,7 +42,7 @@ jobs: repl-test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: cache SBT uses: coursier/cache-action@v6 - name: Java ${{matrix.java}} setup diff --git a/.scalafmt.conf b/.scalafmt.conf index 6e20fa11f6..ba7ecf0409 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,4 +1,4 @@ -version = "3.7.4" +version = "3.7.14" runner.dialect = scala213source3 fileOverride { diff --git a/README.md b/README.md index e3788e2ff6..13897eb11f 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ cat wc/part-00000-of-00004.txt # Documentation -[Getting Started](https://spotify.github.io/scio/Getting-Started.html) is the best place to start with Scio. If you are new to Apache Beam and distributed data processing, check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. If you have experience with other Scala data processing libraries, check out this comparison between [Scio, Scalding and Spark](https://spotify.github.io/scio/Scio,-Scalding-and-Spark.html). Finally check out this document about the relationship between [Scio, Beam and Dataflow](https://spotify.github.io/scio/Scio,-Beam-and-Dataflow.html). +[Getting Started](https://spotify.github.io/scio/Getting-Started.html) is the best place to start with Scio. 
If you are new to Apache Beam and distributed data processing, check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. If you have experience with other Scala data processing libraries, check out this comparison between [Scio, Scalding and Spark](https://spotify.github.io/scio/Scio,-Scalding-and-Spark.html). Example Scio pipelines and tests can be found under [scio-examples](https://github.com/spotify/scio/tree/master/scio-examples/src). A lot of them are direct ports from Beam's Java [examples](https://github.com/apache/beam/tree/master/examples). See this [page](http://spotify.github.io/scio/examples/) for some of them with side-by-side explanation. Also see [Big Data Rosetta Code](https://github.com/spotify/big-data-rosetta-code) for common data processing code snippets in Scio, Scalding and Spark. diff --git a/build.sbt b/build.sbt index 6d504d52b9..ad6061fde6 100644 --- a/build.sbt +++ b/build.sbt @@ -29,25 +29,25 @@ import _root_.io.github.davidgregory084.DevMode ThisBuild / turbo := true val beamVendorVersion = "0.1" -val beamVersion = "2.48.0" +val beamVersion = "2.50.0" // check version used by beam -// https://github.com/apache/beam/blob/v2.48.0/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +// https://github.com/apache/beam/blob/v2.50.0/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy val autoServiceVersion = "1.0.1" val autoValueVersion = "1.9" val avroVersion = "1.8.2" -val bigdataossVersion = "2.2.6" +val bigdataossVersion = "2.2.16" val bigtableClientVersion = "1.28.0" val commonsCodecVersion = "1.15" val commonsCompressVersion = "1.21" -val commonsIoVersion = "2.7" +val commonsIoVersion = "2.13.0" val commonsLang3Version = "3.9" val commonsMath3Version = "3.6.1" -val datastoreV1ProtoClientVersion = "2.9.0" +val datastoreV1ProtoClientVersion = "2.16.3" val flinkVersion = "1.16.0" val googleClientsVersion = "2.0.0" val googleOauthClientVersion = "1.34.1" -val guavaVersion = "31.1-jre" +val guavaVersion = "32.1.2-jre" val hadoopVersion = "2.10.2" val httpClientVersion = "4.5.13" val httpCoreVersion = "4.4.14" @@ -55,67 +55,66 @@ val jacksonVersion = "2.14.1" val javaxAnnotationApiVersion = "1.3.2" val jodaTimeVersion = "2.10.10" val nettyTcNativeVersion = "2.0.52.Final" -val nettyVersion = "4.1.77.Final" +val nettyVersion = "4.1.87.Final" val slf4jVersion = "1.7.30" -val sparkVersion = "3.1.2" +val sparkVersion = "3.4.1" val zetasketchVersion = "0.1.0" // dependent versions -val googleApiServicesBigQueryVersion = s"v2-rev20220924-$googleClientsVersion" +val googleApiServicesBigQueryVersion = s"v2-rev20230520-$googleClientsVersion" val googleApiServicesDataflowVersion = s"v1b3-rev20220920-$googleClientsVersion" val googleApiServicesPubsubVersion = s"v1-rev20220904-$googleClientsVersion" -// beam is rev20220705 but conflicts with libraries-bom -val googleApiServicesStorageVersion = s"v1-rev20230301-$googleClientsVersion" +val googleApiServicesStorageVersion = s"v1-rev20230617-$googleClientsVersion" // check versions from libraries-bom -// https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/26.14.0/index.html +// https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/26.22.0/index.html val animalSnifferAnnotationsVersion = "1.23" -val bigQueryStorageBetaVersion = "0.160.1" -val bigQueryStorageVersion = "2.36.1" -val 
checkerFrameworkVersion = "3.32.0" +val bigQueryStorageBetaVersion = "0.165.1" +val bigQueryStorageVersion = "2.41.1" +val checkerFrameworkVersion = "3.33.0" val errorProneAnnotationsVersion = "2.18.0" val failureAccessVersion = "1.0.1" val floggerVersion = "0.7.4" -val gaxHttpJsonVersion = "0.111.0" -val gaxVersion = "2.26.0" -val googleApiCommonVersion = "2.9.0" -val googleAuthVersion = "1.16.0" -val googleCloudBigTableVersion = "2.22.0" -val googleCloudCoreVersion = "2.16.0" -val googleCloudDatastoreVersion = "0.105.5" -val googleCloudMonitoringVersion = "3.17.0" -val googleCloudPubSubVersion = "1.105.11" -val googleCloudSpannerVersion = "6.41.0" -val googleCloudStorageVersion = "2.22.1" -val googleCommonsProtoVersion = "2.17.0" -val googleHttpClientsVersion = "1.43.1" -val googleIAMVersion = "1.12.0" -val grpcVersion = "1.54.0" +val gaxVersion = "2.32.0" +val googleApiCommonVersion = "2.15.0" +val googleAuthVersion = "1.19.0" +val googleCloudBigTableVersion = "2.26.0" +val googleCloudCoreVersion = "2.22.0" +val googleCloudDatastoreVersion = "0.107.3" +val googleCloudMonitoringVersion = "3.24.0" +val googleCloudPubSubVersion = "1.106.1" +val googleCloudSpannerVersion = "6.45.0" +val googleCloudStorageVersion = "2.26.0" +val googleCommonsProtoVersion = "2.23.0" +val googleHttpClientsVersion = "1.43.3" +val googleIAMVersion = "1.18.0" +val grpcVersion = "1.56.1" +val j2objcAnnotationsVersion = "2.8" val jsr305Version = "3.0.2" val opencensusVersion = "0.31.1" val perfmarkVersion = "0.26.0" -val protobufVersion = "3.21.12" +val protobufVersion = "3.23.2" -val algebirdVersion = "0.13.9" +val algebirdVersion = "0.13.10" val algebraVersion = "2.9.0" val annoy4sVersion = "0.10.0" val annoyVersion = "0.2.6" val breezeVersion = "2.1.0" val caffeineVersion = "2.9.3" -val cassandraDriverVersion = "3.11.3" -val cassandraVersion = "3.11.15" +val cassandraDriverVersion = "3.11.5" +val cassandraVersion = "3.11.16" val catsVersion = "2.9.0" val chillVersion = "0.10.0" -val circeVersion = "0.14.5" +val circeVersion = "0.14.6" val commonsTextVersion = "1.10.0" val elasticsearch7Version = "7.17.9" -val elasticsearch8Version = "8.8.1" +val elasticsearch8Version = "8.9.2" val fansiVersion = "0.4.0" val featranVersion = "0.8.0" val httpAsyncClientVersion = "4.1.5" val hamcrestVersion = "2.2" val jakartaJsonVersion = "2.1.2" val javaLshVersion = "0.12" -val jedisVersion = "4.4.3" +val jedisVersion = "4.4.4" val jnaVersion = "5.13.0" val junitInterfaceVersion = "0.13.3" val junitVersion = "4.13.2" @@ -125,7 +124,7 @@ val kryoVersion = "4.0.3" val magnoliaVersion = "1.1.3" val magnolifyVersion = "0.6.2" val metricsVersion = "3.2.6" -val neo4jDriverVersion = "4.4.11" +val neo4jDriverVersion = "4.4.12" val ndArrayVersion = "0.3.3" val parquetExtraVersion = "0.4.3" val parquetVersion = "1.12.3" @@ -134,25 +133,17 @@ val protobufGenericVersion = "0.2.9" val scalacheckVersion = "1.17.0" val scalaCollectionCompatVersion = "2.11.0" val scalaMacrosVersion = "2.1.1" -val scalatestVersion = "3.2.16" +val scalatestVersion = "3.2.17" val shapelessVersion = "2.3.10" val sparkeyVersion = "3.2.5" val tensorFlowVersion = "0.4.2" -val testContainersVersion = "0.40.17" +val testContainersVersion = "0.41.0" val zoltarVersion = "0.6.0" // dependent versions val scalatestplusVersion = s"$scalatestVersion.0" val NothingFilter: explicitdeps.ModuleFilter = { _ => false } -// to remove after beam 2.48 -// fixed in https://github.com/apache/beam/pull/25713 -val testLibs = Seq[ExclusionRule]( - "junit" % "junit", - "org.hamcrest" % 
"hamcrest", - "org.hamcrest" % "hamcrest-core" -) - ThisBuild / tpolecatDefaultOptionsMode := DevMode ThisBuild / tpolecatDevModeOptions ~= { opts => val excludes = Set( @@ -209,12 +200,10 @@ def previousVersion(currentVersion: String): Option[String] = { lazy val mimaSettings = Def.settings( mimaBinaryIssueFilters := Seq.empty, - // enable back after 0.13 - mimaPreviousArtifacts := Set.empty -// previousVersion(version.value) -// .filter(_ => publishArtifact.value) -// .map(organization.value % s"${normalizedName.value}_${scalaBinaryVersion.value}" % _) -// .toSet + mimaPreviousArtifacts := previousVersion(version.value) + .filter(_ => publishArtifact.value) + .map(organization.value % s"${normalizedName.value}_${scalaBinaryVersion.value}" % _) + .toSet ) lazy val formatSettings = Def.settings(scalafmtOnCompile := false, javafmtOnCompile := false) @@ -259,8 +248,14 @@ val commonSettings = formatSettings ++ Compile / doc / javacOptions := Seq("-source", "1.8"), excludeDependencies ++= Seq( "org.apache.beam" % "beam-sdks-java-io-kafka", + // logger implementation must be given by the runner lib + "ch.qos.logback" % "logback-classic", + "ch.qos.logback" % "logback-core", + "ch.qos.reload4j" % "reload4j", "org.slf4j" % "slf4j-log4j12", - "org.slf4j" % "slf4j-reload4j" + "org.slf4j" % "slf4j-reload4j", + "io.dropwizard.metrics" % "metrics-logback", + "log4j" % "log4j" ), resolvers ++= Resolver.sonatypeOssRepos("public"), fork := true, @@ -637,7 +632,6 @@ lazy val `scio-test`: Project = project "org.apache.beam" % "beam-sdks-java-core" % beamVersion % "test", "org.scalacheck" %% "scalacheck" % scalacheckVersion % "test,it", "org.scalatestplus" %% "scalacheck-1-17" % scalatestplusVersion % "test,it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "test,it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "test,it" ), Test / compileOrder := CompileOrder.JavaThenScala, @@ -696,7 +690,6 @@ lazy val `scio-avro`: Project = project "org.scalacheck" %% "scalacheck" % scalacheckVersion % "test,it", "org.scalatest" %% "scalatest" % scalatestVersion % "test,it", "org.scalatestplus" %% "scalacheck-1-17" % scalatestplusVersion % "test,it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "test,it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "test,it", "org.typelevel" %% "cats-core" % catsVersion % "test" ) @@ -758,7 +751,7 @@ lazy val `scio-google-cloud-platform`: Project = project "org.apache.avro" % "avro" % avroVersion, "org.apache.beam" % "beam-sdks-java-core" % beamVersion, "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion, - "org.apache.beam" % "beam-sdks-java-io-google-cloud-platform" % beamVersion excludeAll (testLibs: _*), + "org.apache.beam" % "beam-sdks-java-io-google-cloud-platform" % beamVersion, "org.apache.beam" % "beam-vendor-guava-26_0-jre" % beamVendorVersion, "org.slf4j" % "slf4j-api" % slf4jVersion, // test @@ -769,7 +762,6 @@ lazy val `scio-google-cloud-platform`: Project = project "org.scalacheck" %% "scalacheck" % scalacheckVersion % "test,it", "org.scalatest" %% "scalatest" % scalatestVersion % "test,it", "org.scalatestplus" %% "scalacheck-1-17" % scalatestplusVersion % "test,it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "test,it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "test,it", "org.typelevel" %% "cats-core" % catsVersion % "test" ) @@ -796,17 +788,13 @@ lazy val `scio-cassandra3`: Project = project "com.google.protobuf" % "protobuf-java" % protobufVersion, "com.twitter" % "chill-java" % chillVersion, "com.twitter" 
%% "chill" % chillVersion, - "org.apache.cassandra" % "cassandra-all" % cassandraVersion excludeAll ( - "ch.qos.logback" % "logback-classic", - "org.slf4j" % "log4j-over-slf4j" - ), + "org.apache.cassandra" % "cassandra-all" % cassandraVersion, "org.apache.hadoop" % "hadoop-common" % hadoopVersion, "org.apache.hadoop" % "hadoop-mapreduce-client-core" % hadoopVersion, "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompatVersion, // test "org.apache.beam" % "beam-sdks-java-core" % beamVersion % "test,it", "org.scalatest" %% "scalatest" % scalatestVersion % "test,it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "test,it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "test,it" ) ) @@ -844,7 +832,6 @@ lazy val `scio-elasticsearch-common`: Project = project "com.fasterxml.jackson.core" % "jackson-databind" % jacksonVersion % "it", "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonVersion % "it", "org.scalatest" %% "scalatest" % scalatestVersion % "test,it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "test,it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "test,it" ) ) @@ -931,7 +918,6 @@ lazy val `scio-extra`: Project = project "com.github.ben-manes.caffeine" % "caffeine" % caffeineVersion % "test,it", "org.scalacheck" %% "scalacheck" % scalacheckVersion % "test,it", "org.scalatest" %% "scalatest" % scalatestVersion % "test,it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "test,it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "test,it" ), Compile / sourceDirectories := (Compile / sourceDirectories).value @@ -1010,7 +996,6 @@ lazy val `scio-neo4j`: Project = project // test "com.dimafeng" %% "testcontainers-scala-neo4j" % testContainersVersion % "it", "com.dimafeng" %% "testcontainers-scala-scalatest" % testContainersVersion % "it", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "it", "org.slf4j" % "slf4j-simple" % slf4jVersion % "it" ) ) @@ -1065,10 +1050,11 @@ lazy val `scio-parquet`: Project = project "org.apache.parquet" % "parquet-hadoop" % parquetVersion, "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompatVersion, "org.slf4j" % "slf4j-api" % slf4jVersion, + // runtime + "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % Runtime, // log4j is excluded from hadoop // provided "org.tensorflow" % "tensorflow-core-api" % tensorFlowVersion % Provided, // test - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % Test, "org.slf4j" % "slf4j-simple" % slf4jVersion % Test ) ) @@ -1113,7 +1099,6 @@ lazy val `scio-tensorflow`: Project = project "com.spotify" %% "featran-scio" % featranVersion % Test, "com.spotify" %% "featran-tensorflow" % featranVersion % Test, "com.spotify" %% "magnolify-tensorflow" % magnolifyVersion % Test, - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % Test, "org.slf4j" % "slf4j-simple" % slf4jVersion % Test ) ) @@ -1176,19 +1161,17 @@ lazy val `scio-examples`: Project = project "com.spotify" %% "magnolify-tensorflow" % magnolifyVersion, "com.twitter" %% "algebird-core" % algebirdVersion, "joda-time" % "joda-time" % jodaTimeVersion, - "mysql" % "mysql-connector-java" % "8.0.33", + "com.mysql" % "mysql-connector-j" % "8.1.0", "org.apache.avro" % "avro" % avroVersion, "org.apache.beam" % "beam-sdks-java-core" % beamVersion, - "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion excludeAll (testLibs: _*), + "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion, "org.apache.beam" % "beam-sdks-java-extensions-sql" % 
beamVersion, - "org.apache.beam" % "beam-sdks-java-io-google-cloud-platform" % beamVersion excludeAll (testLibs: _*), + "org.apache.beam" % "beam-sdks-java-io-google-cloud-platform" % beamVersion, "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompatVersion, "org.slf4j" % "slf4j-api" % slf4jVersion, // runtime "com.google.cloud.bigdataoss" % "gcs-connector" % s"hadoop2-$bigdataossVersion" % Runtime, - "com.google.cloud.sql" % "mysql-socket-factory-connector-j-8" % "1.12.0" % Runtime, - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % Runtime, - "org.slf4j" % "slf4j-simple" % slf4jVersion % Runtime, + "com.google.cloud.sql" % "mysql-socket-factory-connector-j-8" % "1.13.1" % Runtime, // test "org.scalacheck" %% "scalacheck" % scalacheckVersion % Test ), @@ -1231,14 +1214,13 @@ lazy val `scio-repl`: Project = project "org.apache.beam" % "beam-sdks-java-core" % beamVersion excludeAll ( "com.google.cloud.bigdataoss" % "gcsio" ), - "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion excludeAll (testLibs: _*), + "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion, "org.scala-lang" % "scala-compiler" % scalaVersion.value, "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompatVersion, "org.slf4j" % "slf4j-api" % slf4jVersion, // runtime "org.apache.beam" % "beam-runners-direct-java" % beamVersion % Runtime, - "org.apache.beam" % "beam-runners-google-cloud-dataflow-java" % beamVersion % Runtime excludeAll (testLibs: _*), - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % Runtime, + "org.apache.beam" % "beam-runners-google-cloud-dataflow-java" % beamVersion % Runtime, "org.slf4j" % "slf4j-simple" % slf4jVersion % Runtime ), libraryDependencies ++= { @@ -1254,17 +1236,29 @@ lazy val `scio-repl`: Project = project assembly / assemblyMergeStrategy ~= { old => { case PathList("org", "apache", "beam", "sdk", "extensions", "avro", _*) => - // prefer beam-runners-direct-java until we explicitly move to beam-sdks-java-extensions-avro + // prefer beam avro classes from extensions lib instead of ones shipped in runners CustomMergeStrategy("BeamAvro") { conflicts => import sbtassembly.Assembly._ conflicts.collectFirst { - case Library(ModuleCoordinate(_, "beam-runners-direct-java", _), _, t, s) => + case Library(ModuleCoordinate(_, "beam-sdks-java-extensions-avro", _), _, t, s) => JarEntry(t, s) } match { case Some(e) => Right(Vector(e)) case None => Left("Error merging beam avro classes") } } + case PathList("org", "checkerframework", _*) => + // prefer checker-qual classes packaged in checkerframework libs + CustomMergeStrategy("CheckerQual") { conflicts => + import sbtassembly.Assembly._ + conflicts.collectFirst { + case Library(ModuleCoordinate("org.checkerframework", _, _), _, t, s) => + JarEntry(t, s) + } match { + case Some(e) => Right(Vector(e)) + case None => Left("Error merging checker-qual classes") + } + } case PathList("dev", "ludovic", "netlib", "InstanceBuilder.class") => // arbitrary pick last conflicting InstanceBuilder MergeStrategy.last @@ -1348,7 +1342,7 @@ lazy val `scio-smb`: Project = project "org.apache.beam" % "beam-sdks-java-extensions-protobuf" % beamVersion, // #3260 work around for sorter memory limit until we patch upstream // "org.apache.beam" % "beam-sdks-java-extensions-sorter" % beamVersion, - "org.apache.beam" % "beam-sdks-java-io-google-cloud-platform" % beamVersion excludeAll (testLibs: _*), + "org.apache.beam" % 
"beam-sdks-java-io-google-cloud-platform" % beamVersion, "org.apache.beam" % "beam-sdks-java-io-hadoop-common" % beamVersion, "org.apache.beam" % "beam-vendor-guava-26_0-jre" % beamVendorVersion, "org.apache.commons" % "commons-lang3" % commonsLang3Version, @@ -1371,7 +1365,7 @@ lazy val `scio-smb`: Project = project "org.apache.beam" % "beam-sdks-java-core" % beamVersion % "it,test" classifier "tests", "org.hamcrest" % "hamcrest" % hamcrestVersion % "it,test", "org.scalatest" %% "scalatest" % scalatestVersion % "it,test", - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "it,test", + "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % "it,test", // log4j is excluded from hadoop "org.slf4j" % "slf4j-simple" % slf4jVersion % "it,test" ), javacOptions ++= { @@ -1402,7 +1396,6 @@ lazy val `scio-redis`: Project = project "redis.clients" % "jedis" % jedisVersion, // test "org.scalatest" %% "scalatest" % scalatestVersion % Test, - "org.slf4j" % "log4j-over-slf4j" % slf4jVersion % Test, "org.slf4j" % "slf4j-simple" % slf4jVersion % Test ) ) @@ -1421,14 +1414,22 @@ lazy val site: Project = project MdocPlugin ) .dependsOn( - `scio-macros`, - `scio-core`, `scio-avro`, + `scio-cassandra3`, + `scio-core`, + `scio-elasticsearch-common`, + `scio-elasticsearch8`, + `scio-extra`, `scio-google-cloud-platform`, + `scio-grpc` % "compile->test", + `scio-jdbc`, + `scio-macros`, + `scio-neo4j`, `scio-parquet`, + `scio-redis`, `scio-smb`, - `scio-test` % "compile->test", - `scio-extra` + `scio-tensorflow`, + `scio-test` % "compile->test" ) .settings(commonSettings) .settings(macroSettings) @@ -1447,18 +1448,21 @@ lazy val site: Project = project ScalaUnidoc / siteSubdirName := "api", ScalaUnidoc / scalacOptions := Seq.empty, ScalaUnidoc / unidoc / unidocProjectFilter := inProjects( - `scio-core`, - `scio-test`, `scio-avro`, - `scio-google-cloud-platform`, `scio-cassandra3`, + `scio-core`, + `scio-elasticsearch-common`, `scio-elasticsearch8`, `scio-extra`, + `scio-google-cloud-platform`, + `scio-grpc`, `scio-jdbc`, + `scio-neo4j`, `scio-parquet`, + `scio-redis`, + `scio-smb`, `scio-tensorflow`, - `scio-macros`, - `scio-smb` + `scio-test` ), // unidoc handles class paths differently than compile and may give older // versions high precedence. 
@@ -1474,19 +1478,26 @@ lazy val site: Project = project // paradox paradox / sourceManaged := mdocOut.value, paradoxProperties ++= Map( + "extref.example.base_url" -> "https://spotify.github.io/scio/examples/%s.scala.html", + "github.base_url" -> "https://github.com/spotify/scio", + "javadoc.com.google.api.services.bigquery.base_url" -> "https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/", + "javadoc.com.google.common.hash.base_url" -> s"https://guava.dev/releases/$guavaVersion/api/docs", "javadoc.com.spotify.scio.base_url" -> "http://spotify.github.com/scio/api", - "javadoc.org.apache.beam.sdk.extensions.smb.base_url" -> - "https://spotify.github.io/scio/api/org/apache/beam/sdk/extensions/smb", + "javadoc.org.apache.avro.base_url" -> "https://avro.apache.org/docs/current/api/java/", "javadoc.org.apache.beam.base_url" -> s"https://beam.apache.org/releases/javadoc/$beamVersion", + "javadoc.org.apache.beam.sdk.extensions.smb.base_url" -> "https://spotify.github.io/scio/api/org/apache/beam/sdk/extensions/smb", + "javadoc.org.joda.time.base_url" -> "https://www.joda.org/joda-time/apidocs", + "javadoc.org.tensorflow.base_url" -> "https://www.tensorflow.org/jvm/api_docs/java/", + "javadoc.org.tensorflow.link_style" -> "direct", "scaladoc.com.spotify.scio.base_url" -> "https://spotify.github.io/scio/api", - "github.base_url" -> "https://github.com/spotify/scio", - "extref.example.base_url" -> "https://spotify.github.io/scio/examples/%s.scala.html" + "scaladoc.com.twitter.algebird.base_url" -> "https://twitter.github.io/algebird/api/", + "scaladoc.kantan.base_url" -> "https://nrinaudo.github.io/kantan.csv/api" ), Compile / paradoxMaterialTheme := ParadoxMaterialTheme() .withFavicon("images/favicon.ico") .withColor("white", "indigo") .withLogo("images/logo.png") - .withCopyright("Copyright (C) 2020 Spotify AB") + .withCopyright("Copyright (C) 2023 Spotify AB") .withRepository(uri("https://github.com/spotify/scio")) .withSocial(uri("https://github.com/spotify"), uri("https://twitter.com/spotifyeng")), // sbt-site @@ -1505,7 +1516,7 @@ lazy val soccoSettings = if (sys.env.contains("SOCCO")) { "-P:socco:package_com.spotify.scio:https://spotify.github.io/scio/api" ), autoCompilerPlugins := true, - addCompilerPlugin(("io.regadas" %% "socco-ng" % "0.1.7").cross(CrossVersion.full)), + addCompilerPlugin(("io.regadas" %% "socco-ng" % "0.1.9").cross(CrossVersion.full)), // Generate scio-examples/target/site/index.html soccoIndex := SoccoIndex.generate(target.value / "site" / "index.html"), Compile / compile := { @@ -1528,7 +1539,7 @@ ThisBuild / dependencyOverrides ++= Seq( "com.google.api" % "api-common" % googleApiCommonVersion, "com.google.api" % "gax" % gaxVersion, "com.google.api" % "gax-grpc" % gaxVersion, - "com.google.api" % "gax-httpjson" % gaxHttpJsonVersion, + "com.google.api" % "gax-httpjson" % gaxVersion, "com.google.api-client" % "google-api-client" % googleClientsVersion, "com.google.api.grpc" % "grpc-google-common-protos" % googleCommonsProtoVersion, "com.google.api.grpc" % "proto-google-cloud-bigtable-admin-v2" % googleCloudBigTableVersion, @@ -1554,6 +1565,7 @@ ThisBuild / dependencyOverrides ++= Seq( "com.google.http-client" % "google-http-client-gson" % googleHttpClientsVersion, "com.google.http-client" % "google-http-client-jackson2" % googleHttpClientsVersion, "com.google.http-client" % "google-http-client-protobuf" % googleHttpClientsVersion, + "com.google.j2objc" % "j2objc-annotations" % j2objcAnnotationsVersion, "com.google.protobuf" % 
"protobuf-java" % protobufVersion, "com.google.protobuf" % "protobuf-java-util" % protobufVersion, "commons-codec" % "commons-codec" % commonsCodecVersion, diff --git a/project/build.properties b/project/build.properties index 40b3b8e7b6..3040987151 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.9.0 +sbt.version=1.9.4 diff --git a/project/plugins.sbt b/project/plugins.sbt index 125598d1a6..f8863c1eb9 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,27 +1,27 @@ addDependencyTreePlugin -addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.5.6") +addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.5.11") addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.11.0") addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.1") addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.11.0") addSbtPlugin("com.github.cb372" % "sbt-explicit-dependencies" % "0.3.1") -addSbtPlugin("com.github.sbt" % "sbt-avro" % "3.4.2") +addSbtPlugin("com.github.sbt" % "sbt-avro" % "3.4.3") addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.5.12") addSbtPlugin("com.github.sbt" % "sbt-unidoc" % "0.5.0") -addSbtPlugin("com.lightbend.paradox" % "sbt-paradox" % "0.10.3") +addSbtPlugin("com.lightbend.paradox" % "sbt-paradox" % "0.10.5") addSbtPlugin("com.lightbend.sbt" % "sbt-java-formatter" % "0.8.0") addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.6") addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.2") -addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "1.1.2") +addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "1.1.3") addSbtPlugin("com.github.sbt" % "sbt-ghpages" % "0.8.0") addSbtPlugin("com.github.sbt" % "sbt-site" % "1.5.0") addSbtPlugin("com.github.sbt" % "sbt-site-paradox" % "1.5.0") addSbtPlugin("de.heikoseeberger" % "sbt-header" % "5.10.0") -addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.4.2") +addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.4.4") addSbtPlugin("io.github.jonas" % "sbt-paradox-material-theme" % "0.6.0") addSbtPlugin("org.scalameta" % "sbt-mdoc" % "2.3.7") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.0") -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8") -addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.5") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.9") +addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.6") libraryDependencies += "org.apache.avro" % "avro-compiler" % "1.8.2" diff --git a/scalafix/build.sbt b/scalafix/build.sbt index ea28ebb6cd..031c0d3b98 100644 --- a/scalafix/build.sbt +++ b/scalafix/build.sbt @@ -41,7 +41,7 @@ lazy val root = project `output-0_13`, // scalafix rules - ): _*, + ): _* ) lazy val rules = project @@ -56,6 +56,7 @@ def scio(version: String): List[ModuleID] = { val modules = List( "scio-core", "scio-avro", + "scio-parquet", "scio-test", "scio-jdbc", "scio-tensorflow" diff --git a/scalafix/input-0_13/src/main/scala/fix/v0_13_0/FixTaps.scala b/scalafix/input-0_13/src/main/scala/fix/v0_13_0/FixTaps.scala new file mode 100644 index 0000000000..40db71b431 --- /dev/null +++ b/scalafix/input-0_13/src/main/scala/fix/v0_13_0/FixTaps.scala @@ -0,0 +1,32 @@ +/* +rule = FixTaps +*/ +package fix.v0_13_0 + +import com.spotify.scio.avro.{GenericRecordParseTap, GenericRecordTap, ObjectFileTap, SpecificRecordTap} +import com.spotify.scio.io.TextTap +import com.spotify.scio.parquet.types.ParquetTypeIO +import com.spotify.scio.tensorflow.TFRecordFileTap +import org.apache.avro.Schema 
+import org.apache.avro.generic.GenericRecord +import org.apache.avro.specific.SpecificRecord + +object FixTaps { + type T = SpecificRecord + val path: String = ??? + SpecificRecordTap[T](path) + + ObjectFileTap[T](path) + + val schema: Schema = ??? + GenericRecordTap(path, schema) + + val parseFn: GenericRecord => T = ??? + GenericRecordParseTap(path, parseFn) + + TextTap(path) + + ParquetTypeIO.ReadParam[T]() + + TFRecordFileTap(path) +} diff --git a/scalafix/input-0_13/src/main/scala/fix/v0_13_0/FixTfParameter.scala b/scalafix/input-0_13/src/main/scala/fix/v0_13_0/FixTfParameter.scala new file mode 100644 index 0000000000..8534c8e3c3 --- /dev/null +++ b/scalafix/input-0_13/src/main/scala/fix/v0_13_0/FixTfParameter.scala @@ -0,0 +1,25 @@ +/* +rule = FixTfParameter + */ +package fix.v0_13_0 + +import com.spotify.scio.values.SCollection +import com.spotify.scio.tensorflow._ +import com.spotify.zoltar.tf.TensorFlowModel +import org.tensorflow._ + +object FixTfParameter { + case class A() + case class B() + case class C() + + def toTensors(a: A): Map[String, Tensor] = ??? + def fromTensors(a: A, tensors: Map[String, Tensor]): B = ??? + + val elements: SCollection[A] = ??? + val options: TensorFlowModel.Options = ??? + val fetchOpts: Seq[String] = ??? + + val result: SCollection[B] = elements.predict[B, C]("gs://model-path", fetchOpts, options)(toTensors)(fromTensors) + val b: SCollection[B] = elements.predictWithSigDef[B, C]("gs://model-path", options)(toTensors)(fromTensors _) +} diff --git a/scalafix/output-0_13/src/main/scala/fix/v0_13_0/FixTaps.scala b/scalafix/output-0_13/src/main/scala/fix/v0_13_0/FixTaps.scala new file mode 100644 index 0000000000..6d990346b6 --- /dev/null +++ b/scalafix/output-0_13/src/main/scala/fix/v0_13_0/FixTaps.scala @@ -0,0 +1,32 @@ +package fix.v0_13_0 + +import com.spotify.scio.avro.{GenericRecordParseTap, GenericRecordTap, ObjectFileTap, SpecificRecordTap} +import com.spotify.scio.io.TextTap +import com.spotify.scio.parquet.types.ParquetTypeIO +import com.spotify.scio.tensorflow.TFRecordFileTap +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.avro.specific.SpecificRecord +import com.spotify.scio.avro.AvroIO +import com.spotify.scio.io.TextIO +import com.spotify.scio.tensorflow.TFRecordIO + +object FixTaps { + type T = SpecificRecord + val path: String = ??? + SpecificRecordTap[T](path, AvroIO.ReadParam()) + + ObjectFileTap[T](path, AvroIO.ReadParam()) + + val schema: Schema = ??? + GenericRecordTap(path, schema, AvroIO.ReadParam()) + + val parseFn: GenericRecord => T = ??? + GenericRecordParseTap(path, parseFn, AvroIO.ReadParam()) + + TextTap(path, TextIO.ReadParam()) + + ParquetTypeIO.ReadParam() + + TFRecordFileTap(path, TFRecordIO.ReadParam()) +} diff --git a/scalafix/output-0_13/src/main/scala/fix/v0_13_0/FixTfParameter.scala b/scalafix/output-0_13/src/main/scala/fix/v0_13_0/FixTfParameter.scala new file mode 100644 index 0000000000..94de860c3c --- /dev/null +++ b/scalafix/output-0_13/src/main/scala/fix/v0_13_0/FixTfParameter.scala @@ -0,0 +1,22 @@ +package fix.v0_13_0 + +import com.spotify.scio.values.SCollection +import com.spotify.scio.tensorflow._ +import com.spotify.zoltar.tf.TensorFlowModel +import org.tensorflow._ + +object FixTfParameter { + case class A() + case class B() + case class C() + + def toTensors(a: A): Map[String, Tensor] = ??? + def fromTensors(a: A, tensors: Map[String, Tensor]): B = ??? + + val elements: SCollection[A] = ??? + val options: TensorFlowModel.Options = ??? 
+ val fetchOpts: Seq[String] = ??? + + val result: SCollection[B] = elements.predict[B]("gs://model-path", fetchOpts, options)(toTensors)(fromTensors) + val b: SCollection[B] = elements.predictWithSigDef[B]("gs://model-path", options)(toTensors)(fromTensors _) +} diff --git a/scalafix/project/Scio.scala b/scalafix/project/Scio.scala index 4e2777a37c..9dfb9bf419 100644 --- a/scalafix/project/Scio.scala +++ b/scalafix/project/Scio.scala @@ -6,5 +6,5 @@ object Scio { val `0.10` = "0.10.4" val `0.11` = "0.11.9" val `0.12` = "0.12.0" - val `0.13` = "0.13.0-RC1" // TODO + val `0.13` = "0.13.0" } diff --git a/scalafix/project/build.properties b/scalafix/project/build.properties index 40b3b8e7b6..3040987151 100644 --- a/scalafix/project/build.properties +++ b/scalafix/project/build.properties @@ -1 +1 @@ -sbt.version=1.9.0 +sbt.version=1.9.4 diff --git a/scalafix/project/plugins.sbt b/scalafix/project/plugins.sbt index 8ad86a3de6..500ee37615 100644 --- a/scalafix/project/plugins.sbt +++ b/scalafix/project/plugins.sbt @@ -1,4 +1,4 @@ resolvers += Resolver.sonatypeRepo("releases") addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.11.0") addSbtPlugin("com.eed3si9n" % "sbt-projectmatrix" % "0.9.0") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.0") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2") diff --git a/scalafix/rules/src/main/resources/META-INF/services/scalafix.v1.Rule b/scalafix/rules/src/main/resources/META-INF/services/scalafix.v1.Rule index 3a9fefb060..38c88b896b 100644 --- a/scalafix/rules/src/main/resources/META-INF/services/scalafix.v1.Rule +++ b/scalafix/rules/src/main/resources/META-INF/services/scalafix.v1.Rule @@ -12,4 +12,6 @@ fix.v0_8_0.ConsistenceJoinNames fix.v0_10_0.FixCoderPropagation fix.v0_12_0.FixBqSaveAsTable fix.v0_12_0.FixPubsubSpecializations -fix.v0_13_0.FixSkewedJoins \ No newline at end of file +fix.v0_13_0.FixSkewedJoins +fix.v0_13_0.FixTfParameter +fix.v0_13_0.FixTaps \ No newline at end of file diff --git a/scalafix/rules/src/main/scala/fix/v0_13_0/FixSkewedJoins.scala b/scalafix/rules/src/main/scala/fix/v0_13_0/FixSkewedJoins.scala index bb807ecdc2..79fbc1eab3 100644 --- a/scalafix/rules/src/main/scala/fix/v0_13_0/FixSkewedJoins.scala +++ b/scalafix/rules/src/main/scala/fix/v0_13_0/FixSkewedJoins.scala @@ -7,12 +7,13 @@ import scala.meta.contrib._ object FixSkewedJoins { - private val PairSkewedSCollectionFunctions = "com/spotify/scio/values/PairSkewedSCollectionFunctions" + private val PairSkewedSCollectionFunctions = + "com/spotify/scio/values/PairSkewedSCollectionFunctions" val SkewedJoins: SymbolMatcher = SymbolMatcher.normalized(PairSkewedSCollectionFunctions + "#skewedJoin") + - SymbolMatcher.normalized(PairSkewedSCollectionFunctions + "#skewedLeftOuterJoin") + - SymbolMatcher.normalized(PairSkewedSCollectionFunctions + "#skewedFullOuterJoin") + SymbolMatcher.normalized(PairSkewedSCollectionFunctions + "#skewedLeftOuterJoin") + + SymbolMatcher.normalized(PairSkewedSCollectionFunctions + "#skewedFullOuterJoin") val HotKeyMethodImport = importer"com.spotify.scio.values.HotKeyMethod" @@ -44,12 +45,12 @@ class FixSkewedJoins extends SemanticRule("FixSkewedJoins") { private def isOldSkewedJoinApi(fn: Term)(implicit doc: SemanticDocument): Boolean = { val symbol = fn.symbol SkewedJoins.matches(symbol) && (symbol.info.get.signature match { - case MethodSignature(_, parameterLists, _) => parameterLists.flatten.map(_.symbol.displayName) == OldParameters + case MethodSignature(_, parameterLists, _) => + 
parameterLists.flatten.map(_.symbol.displayName) == OldParameters case _ => false }) } - private def findParam(param: Term.Name, pos: Int)(args: List[Term]): Option[Term] = { args .collectFirst { @@ -64,14 +65,19 @@ class FixSkewedJoins extends SemanticRule("FixSkewedJoins") { doc.tree.collect { case t @ q"$fn(..$params)" if isOldSkewedJoinApi(fn) => val rhs = findParam(ParamRhs, 0)(params).map(p => q"rhs = $p") - val hotKeyMethod = findParam(ParamHotKeyThreshold, 1)(params).map(p => q"hotKeyMethod = HotKeyMethod.Threshold($p)") + val hotKeyMethod = findParam(ParamHotKeyThreshold, 1)(params).map(p => + q"hotKeyMethod = HotKeyMethod.Threshold($p)" + ) val cmsEps = findParam(ParamEps, 2)(params).map(p => q"cmsEps = $p") val cmsDelta = findParam(ParamDelta, 4)(params).map(p => q"cmsDelta = $p") val cmsSeed = findParam(ParamSeed, 3)(params).map(p => q"cmsSeed = $p") - val sampleFraction = findParam(ParamSampleFraction, 5)(params).map(p => q"sampleFraction = $p") - val sampleWithReplacement = findParam(ParamWithReplacement, 6)(params).map(p => q"sampleWithReplacement = $p") + val sampleFraction = + findParam(ParamSampleFraction, 5)(params).map(p => q"sampleFraction = $p") + val sampleWithReplacement = + findParam(ParamWithReplacement, 6)(params).map(p => q"sampleWithReplacement = $p") - val updated = (rhs ++ hotKeyMethod ++ cmsEps ++ cmsDelta ++ cmsSeed ++ sampleFraction ++ sampleWithReplacement).toList + val updated = + (rhs ++ hotKeyMethod ++ cmsEps ++ cmsDelta ++ cmsSeed ++ sampleFraction ++ sampleWithReplacement).toList Patch.addGlobalImport(HotKeyMethodImport) + Patch.replaceTree(t, q"$fn(..$updated)".syntax) }.asPatch } diff --git a/scalafix/rules/src/main/scala/fix/v0_13_0/FixTaps.scala b/scalafix/rules/src/main/scala/fix/v0_13_0/FixTaps.scala new file mode 100644 index 0000000000..24db5c20b5 --- /dev/null +++ b/scalafix/rules/src/main/scala/fix/v0_13_0/FixTaps.scala @@ -0,0 +1,60 @@ +package fix.v0_13_0 + +import scalafix.v1._ + +import scala.meta._ + +object FixTaps { + private val scio = "com.spotify.scio." + private val avro = scio + "avro." 
+ + val ParquetParam = SymbolMatcher.normalized(scio + "parquet.types.ParquetTypeIO.ReadParam") + + val TextTap = SymbolMatcher.normalized(scio + "io.TextTap") + val TFTap = SymbolMatcher.normalized(scio + "tensorflow.TFRecordFileTap") + val AvroTaps = SymbolMatcher.normalized(avro + "SpecificRecordTap") + + SymbolMatcher.normalized(avro + "ObjectFileTap") + val GRTaps = SymbolMatcher.normalized(avro + "GenericRecordTap") + + SymbolMatcher.normalized(avro + "GenericRecordParseTap") + + val AvroIOImport = importer"com.spotify.scio.avro.AvroIO" + val TextIOImport = importer"com.spotify.scio.io.TextIO" + val TFRecordIOImport = importer"com.spotify.scio.tensorflow.TFRecordIO" + + def addReadParam(t: Tree, clazz: Term, optTParam: Option[Type], params: Seq[Term], repl: Term) = { + val updated = params.toList ++ List(repl) + optTParam match { + case None => Patch.replaceTree(t, q"$clazz(..$updated)".syntax) + case Some(tParam) => Patch.replaceTree(t, q"$clazz[$tParam](..$updated)".syntax) + } + } +} + +class FixTaps extends SemanticRule("FixTaps") { + import FixTaps._ + + override def fix(implicit doc: SemanticDocument): Patch = { + doc.tree.collect { + case t @ q"$clazz[$tParam]" if ParquetParam.matches(clazz.symbol) => + // drop type parameter for ParquetTypeIO.ReadParam + Patch.replaceTree(t, q"$clazz".syntax) + case t @ q"$clazz[$tParam](..$params)" + if AvroTaps.matches(clazz.symbol) && params.length == 1 => + addReadParam(t, clazz, Some(tParam), params, q"AvroIO.ReadParam()") + + Patch.addGlobalImport(AvroIOImport) + case t @ q"$clazz(..$params)" => + if (GRTaps.matches(clazz.symbol) && params.length == 2) { + addReadParam(t, clazz, None, params, q"AvroIO.ReadParam()") + + Patch.addGlobalImport(AvroIOImport) + } else if (TextTap.matches(clazz.symbol) && params.length == 1) { + addReadParam(t, clazz, None, params, q"TextIO.ReadParam()") + + Patch.addGlobalImport(TextIOImport) + } else if (TFTap.matches(clazz.symbol) && params.length == 1) { + addReadParam(t, clazz, None, params, q"TFRecordIO.ReadParam()") + + Patch.addGlobalImport(TFRecordIOImport) + } else { + Patch.empty + } + }.asPatch + } +} diff --git a/scalafix/rules/src/main/scala/fix/v0_13_0/FixTfParameter.scala b/scalafix/rules/src/main/scala/fix/v0_13_0/FixTfParameter.scala new file mode 100644 index 0000000000..9f78c0d8a1 --- /dev/null +++ b/scalafix/rules/src/main/scala/fix/v0_13_0/FixTfParameter.scala @@ -0,0 +1,24 @@ +package fix.v0_13_0 + +import scalafix.v1.{MethodSignature, _} + +import scala.meta._ +import scala.meta.contrib._ + +object FixTfParameter { + private val Ops = "com/spotify/scio/tensorflow/syntax/PredictSCollectionOps" + val PredictMatcher: SymbolMatcher = SymbolMatcher.normalized(Ops + "#predict") + + SymbolMatcher.normalized(Ops + "#predictWithSigDef") +} + +class FixTfParameter extends SemanticRule("FixTfParameter") { + import FixTfParameter._ + + override def fix(implicit doc: SemanticDocument): Patch = { + doc.tree.collect { + case t @ q"$fn[..$tParams]" if PredictMatcher.matches(fn.symbol) && tParams.length == 2 => + val newTParam = tParams.toList.head + Patch.replaceTree(t, q"$fn[$newTParam]".syntax) + }.asPatch + } +} diff --git a/scalafix/rules/src/main/scala/fix/v0_8_0/FixBigQueryDeprecations.scala b/scalafix/rules/src/main/scala/fix/v0_8_0/FixBigQueryDeprecations.scala index 14dd6a3589..63d3c9246c 100644 --- a/scalafix/rules/src/main/scala/fix/v0_8_0/FixBigQueryDeprecations.scala +++ b/scalafix/rules/src/main/scala/fix/v0_8_0/FixBigQueryDeprecations.scala @@ -24,4 +24,4 @@ final class 
FixBigQueryDeprecations extends SemanticRule("FixBigQueryDeprecation Patch.replaceTree(head, q"Table.Ref($head)".syntax) } }.asPatch -} \ No newline at end of file +} diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/AvroDatumFactory.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/AvroDatumFactory.scala new file mode 100644 index 0000000000..ea893f29cd --- /dev/null +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/AvroDatumFactory.scala @@ -0,0 +1,42 @@ +/* + * Copyright 2023 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.avro + +import org.apache.avro.Schema +import org.apache.avro.io.{DatumReader, DatumWriter} +import org.apache.avro.reflect.{ReflectData, ReflectDatumReader, ReflectDatumWriter} +import org.apache.beam.sdk.extensions.avro.io.AvroDatumFactory +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils + +/** + * Custom AvroDatumFactory for avro AvroDatumFactory relying on avro reflect so that underlying + * CharSequence type is String + */ +private[scio] class SpecificRecordDatumFactory[T](recordType: Class[T]) + extends AvroDatumFactory[T](recordType) { + override def apply(writer: Schema, reader: Schema): DatumReader[T] = { + val data = new ReflectData(recordType.getClassLoader) + AvroUtils.addLogicalTypeConversions(data) + new ReflectDatumReader[T](writer, reader, data) + } + + override def apply(writer: Schema): DatumWriter[T] = { + val data = new ReflectData(recordType.getClassLoader) + AvroUtils.addLogicalTypeConversions(data) + new ReflectDatumWriter[T](writer, data) + } +} diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/AvroIO.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/AvroIO.scala index 2cbab6327d..08a678d680 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/AvroIO.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/AvroIO.scala @@ -183,6 +183,7 @@ final case class SpecificRecordIO[T <: SpecificRecord: ClassTag: Coder](path: St val t = BAvroIO .read(cls) .from(filePattern) + .withDatumReaderFactory(new SpecificRecordDatumFactory[T](cls)) sc .applyTransform(t) .setCoder(coder) @@ -194,7 +195,9 @@ final case class SpecificRecordIO[T <: SpecificRecord: ClassTag: Coder](path: St */ override protected def write(data: SCollection[T], params: WriteP): Tap[T] = { val cls = ScioUtil.classOf[T] - val t = BAvroIO.write(cls) + val t = BAvroIO + .write(cls) + .withDatumWriterFactory(new SpecificRecordDatumFactory[T](cls)) data.applyInternal( avroOut( diff --git a/scio-core/src/main/scala/com/spotify/scio/coders/BeamCoders.scala b/scio-core/src/main/scala/com/spotify/scio/coders/BeamCoders.scala index 3cfd7369d7..528ff94e0d 100644 --- a/scio-core/src/main/scala/com/spotify/scio/coders/BeamCoders.scala +++ b/scio-core/src/main/scala/com/spotify/scio/coders/BeamCoders.scala @@ -19,7 +19,7 @@ package com.spotify.scio.coders import com.spotify.scio.coders.CoderMaterializer.CoderOptions import com.spotify.scio.values.SCollection -import 
org.apache.beam.sdk.coders.{Coder => BCoder, NullableCoder, StructuredCoder} +import org.apache.beam.sdk.coders.{Coder => BCoder, NullableCoder} import org.apache.beam.sdk.values.PCollection import scala.annotation.tailrec @@ -49,8 +49,7 @@ private[scio] object BeamCoders { val options = CoderOptions(coll.context.options) val coder = coll.internal.getCoder Some(unwrap(options, coder)) - .collect { case c: StructuredCoder[_] => c } - .map(_.getComponents.asScala.toList) + .map(_.getCoderArguments.asScala.toList) .collect { case (c1: BCoder[K]) :: (c2: BCoder[V]) :: Nil => val k = Coder.beam(unwrap(options, c1)) val v = Coder.beam(unwrap(options, c2)) @@ -67,8 +66,7 @@ private[scio] object BeamCoders { val options = CoderOptions(coll.context.options) val coder = coll.internal.getCoder Some(unwrap(options, coder)) - .collect { case c: StructuredCoder[_] => c } - .map(_.getComponents.asScala.toList) + .map(_.getCoderArguments.asScala.toList) .collect { case (c1: BCoder[A]) :: (c2: BCoder[B]) :: (c3: BCoder[C]) :: Nil => val a = Coder.beam(unwrap(options, c1)) val b = Coder.beam(unwrap(options, c2)) @@ -88,8 +86,7 @@ private[scio] object BeamCoders { val options = CoderOptions(coll.context.options) val coder = coll.internal.getCoder Some(unwrap(options, coder)) - .collect { case c: StructuredCoder[_] => c } - .map(_.getComponents.asScala.toList) + .map(_.getCoderArguments.asScala.toList) .collect { case (c1: BCoder[A]) :: (c2: BCoder[B]) :: (c3: BCoder[C]) :: (c4: BCoder[D]) :: Nil => val a = Coder.beam(unwrap(options, c1)) diff --git a/scio-core/src/main/scala/com/spotify/scio/coders/CustomCoder.scala b/scio-core/src/main/scala/com/spotify/scio/coders/CustomCoder.scala index 4777ea07dd..5f3450468f 100644 --- a/scio-core/src/main/scala/com/spotify/scio/coders/CustomCoder.scala +++ b/scio-core/src/main/scala/com/spotify/scio/coders/CustomCoder.scala @@ -19,10 +19,10 @@ package com.spotify.scio.coders import java.io.{InputStream, OutputStream} import org.apache.beam.sdk.coders.Coder.NonDeterministicException -import org.apache.beam.sdk.coders.{Coder => BCoder, CustomCoder} +import org.apache.beam.sdk.coders.{Coder => BCoder, CustomCoder, StructuredCoder} import org.apache.beam.sdk.util.common.ElementByteSizeObserver -import java.util.Objects +import java.util.{List => JList, Objects} import scala.jdk.CollectionConverters._ /////////////////////////////////////////////////////////////////////////////// @@ -120,7 +120,9 @@ final private[scio] class RecordCoder[T]( val cs: IndexedSeq[(String, BCoder[Any])], construct: Seq[Any] => T, destruct: T => IndexedSeq[Any] -) extends CustomCoder[T] { +) extends StructuredCoder[T] { + + override def getCoderArguments: JList[_ <: BCoder[_]] = cs.map(_._2).asJava override def toString: String = { val body = cs.map { case (l, c) => s"$l -> $c" }.mkString(", ") diff --git a/scio-core/src/main/scala/com/spotify/scio/coders/WrappedCoder.scala b/scio-core/src/main/scala/com/spotify/scio/coders/WrappedCoder.scala index 0a81f533a9..5323fdd515 100644 --- a/scio-core/src/main/scala/com/spotify/scio/coders/WrappedCoder.scala +++ b/scio-core/src/main/scala/com/spotify/scio/coders/WrappedCoder.scala @@ -18,18 +18,23 @@ package com.spotify.scio.coders import java.io.{InputStream, OutputStream} -import org.apache.beam.sdk.coders.{Coder => BCoder, StructuredCoder} +import org.apache.beam.sdk.coders.{Coder => BCoder, CustomCoder} import org.apache.beam.sdk.util.common.ElementByteSizeObserver -import java.util.{Collections, List => JList} +import java.util.{List => JList} -/* - 
* */ -sealed abstract private[scio] class WrappedCoder[T] extends StructuredCoder[T] { +sealed abstract private[scio] class WrappedCoder[T] extends CustomCoder[T] { def bcoder: BCoder[T] + override def equals(obj: Any): Boolean = obj match { + case that: WrappedCoder[_] => bcoder == that.bcoder + case _ => false + } + + override def hashCode(): Int = bcoder.hashCode + override def getCoderArguments: JList[_ <: BCoder[_]] = - Collections.singletonList(bcoder) + bcoder.getCoderArguments override def encode(value: T, os: OutputStream): Unit = bcoder.encode(value, os) diff --git a/scio-core/src/main/scala/com/spotify/scio/coders/instances/AvroCoders.scala b/scio-core/src/main/scala/com/spotify/scio/coders/instances/AvroCoders.scala index 4ba9b93c9b..549404c22f 100644 --- a/scio-core/src/main/scala/com/spotify/scio/coders/instances/AvroCoders.scala +++ b/scio-core/src/main/scala/com/spotify/scio/coders/instances/AvroCoders.scala @@ -21,10 +21,14 @@ import com.spotify.scio.coders.Coder import com.spotify.scio.util.ScioUtil import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord +import org.apache.avro.io.{DatumReader, DatumWriter} +import org.apache.avro.reflect.{ReflectData, ReflectDatumReader, ReflectDatumWriter} import org.apache.avro.specific.{SpecificData, SpecificFixed, SpecificRecord} import org.apache.beam.sdk.coders.Coder.NonDeterministicException import org.apache.beam.sdk.coders.{AtomicCoder, CustomCoder, StringUtf8Coder} import org.apache.beam.sdk.extensions.avro.coders.AvroCoder +import org.apache.beam.sdk.extensions.avro.io.AvroDatumFactory +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils import org.apache.beam.sdk.util.common.ElementByteSizeObserver import java.io.{InputStream, OutputStream} @@ -127,25 +131,40 @@ trait AvroCoders { def avroGenericRecordCoder: Coder[GenericRecord] = Coder.beam(new SlowGenericRecordCoder) + // Try to get the schema with SpecificData.getSchema + // This relies on private SCHEMA$ field that may not be defined on custom SpecificRecord instance + // Otherwise create a default instance and call getSchema + private def schemaForClass[T <: SpecificRecord](clazz: Class[T]): Try[Schema] = + Try(SpecificData.get().getSchema(clazz)) + .orElse(Try(clazz.getDeclaredConstructor().newInstance().getSchema)) + implicit def avroSpecificRecordCoder[T <: SpecificRecord: ClassTag]: Coder[T] = { val clazz = ScioUtil.classOf[T] + val schema = schemaForClass(clazz).getOrElse { + val msg = + "Failed to create a coder for SpecificRecord because it is impossible to retrieve an " + + s"Avro schema by instantiating $clazz. Use only a concrete type implementing " + + s"SpecificRecord or use GenericRecord type in your transformations if a concrete " + + s"type is not known in compile time." 
+ throw new RuntimeException(msg) + } + + // same as SpecificRecordDatumFactory in scio-avro + val factory = new AvroDatumFactory(clazz) { + override def apply(writer: Schema, reader: Schema): DatumReader[T] = { + val data = new ReflectData(clazz.getClassLoader) + AvroUtils.addLogicalTypeConversions(data) + new ReflectDatumReader[T](writer, reader, data) + } - // Try to get the schema with SpecificData.getSchema - // This relies on private SCHEMA$ field that may not be defined on custom SpecificRecord instance - val schema = Try(SpecificData.get().getSchema(clazz)) - // Otherwise create a default instance and call getSchema - .orElse(Try(clazz.getDeclaredConstructor().newInstance().getSchema)) - .getOrElse { - val msg = - "Failed to create a coder for SpecificRecord because it is impossible to retrieve an " + - s"Avro schema by instantiating $clazz. Use only a concrete type implementing " + - s"SpecificRecord or use GenericRecord type in your transformations if a concrete " + - s"type is not known in compile time." - throw new RuntimeException(msg) + override def apply(writer: Schema): DatumWriter[T] = { + val data = new ReflectData(clazz.getClassLoader) + AvroUtils.addLogicalTypeConversions(data) + new ReflectDatumWriter[T](writer, data) } + } - val useReflectApi = true // keep this for backward compatibility - Coder.beam(AvroCoder.of(clazz, schema, useReflectApi)) + Coder.beam(AvroCoder.of(factory, schema)) } implicit def avroSpecificFixedCoder[T <: SpecificFixed: ClassTag]: Coder[T] = diff --git a/scio-core/src/main/scala/com/spotify/scio/transforms/syntax/SCollectionSafeSyntax.scala b/scio-core/src/main/scala/com/spotify/scio/transforms/syntax/SCollectionSafeSyntax.scala index 144db855b8..ed615374f5 100644 --- a/scio-core/src/main/scala/com/spotify/scio/transforms/syntax/SCollectionSafeSyntax.scala +++ b/scio-core/src/main/scala/com/spotify/scio/transforms/syntax/SCollectionSafeSyntax.scala @@ -32,7 +32,7 @@ trait SCollectionSafeSyntax { * Enhanced version of [[com.spotify.scio.values.SCollection SCollection]] with specialized * versions of flatMap. */ - implicit class SpecializedFlatMapSCollection[T](private val self: SCollection[T]) { + implicit class SafeFlatMapSCollection[T](private val self: SCollection[T]) { /** * Latency optimized flavor of diff --git a/scio-core/src/main/scala/com/spotify/scio/values/PairSCollectionFunctions.scala b/scio-core/src/main/scala/com/spotify/scio/values/PairSCollectionFunctions.scala index 3d993dea18..ce38622c44 100644 --- a/scio-core/src/main/scala/com/spotify/scio/values/PairSCollectionFunctions.scala +++ b/scio-core/src/main/scala/com/spotify/scio/values/PairSCollectionFunctions.scala @@ -780,7 +780,7 @@ class PairSCollectionFunctions[K, V](val self: SCollection[(K, V)]) { * Batches inputs to a desired batch size. Batches will contain only elements of a single key. * * Elements are buffered until there are batchSize elements buffered, at which point they are - * outputed to the output [[SCollection]]. + * emitted to the output [[SCollection]]. * * Windows are preserved (batches contain elements from the same window). Batches may contain * elements from more than one bundle. @@ -811,7 +811,7 @@ class PairSCollectionFunctions[K, V](val self: SCollection[(K, V)]) { * The value coder is used to determine the byte size of each element. * * Elements are buffered until there are an estimated batchByteSize bytes buffered, at which point - * they are outputed to the output [[SCollection]]. + * they are emitted to the output [[SCollection]]. 
* * Windows are preserved (batches contain elements from the same window). Batches may contain * elements from more than one bundle. @@ -840,7 +840,7 @@ class PairSCollectionFunctions[K, V](val self: SCollection[(K, V)]) { * * The weight of each element is computer from the provided cost function. * - * Elements are buffered until the weight is reached, at which point they are outputed to the + * Elements are buffered until the weight is reached, at which point they are emitted to the * output [[SCollection]]. * * Windows are preserved (batches contain elements from the same window). Batches may contain diff --git a/scio-core/src/main/scala/com/spotify/scio/values/PairSkewedSCollectionFunctions.scala b/scio-core/src/main/scala/com/spotify/scio/values/PairSkewedSCollectionFunctions.scala index 1676822385..95e4b5aa73 100644 --- a/scio-core/src/main/scala/com/spotify/scio/values/PairSkewedSCollectionFunctions.scala +++ b/scio-core/src/main/scala/com/spotify/scio/values/PairSkewedSCollectionFunctions.scala @@ -136,7 +136,7 @@ object SkewedJoins { } /** - * Extra functions available on SCollections of (key, value) pairs for skwed joins through an + * Extra functions available on SCollections of (key, value) pairs for skewed joins through an * implicit conversion. * * @groupname cogroup diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala index 7360e6642c..68fced35b6 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala @@ -146,6 +146,7 @@ object RedisLookUpStringsExample { val connectionOptions = RedisConnectionOptions(redisHost, redisPort) sc.parallelize(Seq("key1", "key2", "unknownKey")) + // #RedisLookup_example .parDo( new RedisDoFn[String, (String, Option[String])](connectionOptions, 1000) { override def request(value: String, client: Client)(implicit @@ -156,6 +157,7 @@ object RedisLookUpStringsExample { .map { case r: List[String @unchecked] => (value, r.headOption) } } ) + // #RedisLookup_example .debug() sc.run() diff --git a/scio-extra/src/main/scala/com/spotify/scio/extra/csv/CsvIO.scala b/scio-extra/src/main/scala/com/spotify/scio/extra/csv/CsvIO.scala index ed59e8c477..77b6420ef3 100644 --- a/scio-extra/src/main/scala/com/spotify/scio/extra/csv/CsvIO.scala +++ b/scio-extra/src/main/scala/com/spotify/scio/extra/csv/CsvIO.scala @@ -88,7 +88,7 @@ import org.apache.beam.sdk.values.PCollection */ object CsvIO { - private val DefaultCsvConfiguration: CsvConfiguration = CsvConfiguration( + val DefaultCsvConfiguration: CsvConfiguration = CsvConfiguration( cellSeparator = ',', quote = '"', quotePolicy = QuotePolicy.WhenNeeded, diff --git a/scio-extra/src/main/scala/com/spotify/scio/extra/sorter/syntax/SCollectionSyntax.scala b/scio-extra/src/main/scala/com/spotify/scio/extra/sorter/syntax/SCollectionSyntax.scala index af0acda013..46e7d89d2b 100644 --- a/scio-extra/src/main/scala/com/spotify/scio/extra/sorter/syntax/SCollectionSyntax.scala +++ b/scio-extra/src/main/scala/com/spotify/scio/extra/sorter/syntax/SCollectionSyntax.scala @@ -33,14 +33,14 @@ final class SorterOps[K1, K2: SortingKey, V](self: SCollection[(K1, Iterable[(K2 /** * Takes an [[SCollection]] with elements consisting of a primary key and iterables over * (secondary key, value) pairs, and returns an [[SCollection]] of the same elements but with - * values sorted lexicographicly 
by the secondary key. + * values sorted lexicographically by the secondary key. * * The secondary key needs to be encoded as a [[String]] or [[Array[Byte]]. [[SortValues]] * compares bytes lexicographically and may write secondary key-value pairs to disk. * * @note * The primary key is explicit here only because this transform is typically used on a result of - * a [[PairSCollectionFunctions.groupByKey]]. + * a [[com.spotify.scio.values.PairSCollectionFunctions.groupByKey]]. * * @param memoryMB * Sets the size of the memory buffer in megabytes. This controls both the buffer for initial in diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/BigQuery.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/BigQuery.scala index 8440d91848..bbdb764559 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/BigQuery.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/BigQuery.scala @@ -142,11 +142,15 @@ final class BigQuery private (val client: Client) { createDisposition ) - def createTypedTable[T <: HasAnnotation: TypeTag](table: Table): Unit = - tables.create(table.setSchema(BigQueryType[T].schema)) + def createTypedTable[T <: HasAnnotation: TypeTag](table: Table): Unit = { + val typedTable = table + .setSchema(BigQueryType[T].schema) + .setDescription(BigQueryType[T].tableDescription.orNull) + tables.create(typedTable) + } def createTypedTable[T <: HasAnnotation: TypeTag](table: TableReference): Unit = - tables.create(table, BigQueryType[T].schema) + tables.create(table, BigQueryType[T].schema, BigQueryType[T].tableDescription) def createTypedTable[T <: HasAnnotation: TypeTag](tableSpec: String): Unit = createTypedTable(beam.BigQueryHelpers.parseTableSpec(tableSpec)) diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/TableOps.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/TableOps.scala index b3aa054ed4..82bdf11b3e 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/TableOps.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/client/TableOps.scala @@ -190,13 +190,20 @@ final private[client] class TableOps(client: Client) { b.result() } - def create(table: Table): Unit = withBigQueryService(_.createTable(table)) + def create(table: Table): Unit = + withBigQueryService(_.createTable(table)) - def create(table: TableReference, schema: TableSchema): Unit = - create(new Table().setTableReference(table).setSchema(schema)) - - def create(tableSpec: String, schema: TableSchema): Unit = - create(bq.BigQueryHelpers.parseTableSpec(tableSpec), schema) + def create( + tableRef: TableReference, + schema: TableSchema, + description: Option[String] = None + ): Unit = { + val table = new Table() + .setTableReference(tableRef) + .setSchema(schema) + .setDescription(description.orNull) + create(table) + } /** * Check if table exists. Returns `true` if table exists, `false` is table definitely does not diff --git a/scio-jdbc/src/main/java/com/spotify/scio/jdbc/JdbcDoFn.java b/scio-jdbc/src/main/java/com/spotify/scio/jdbc/JdbcDoFn.java new file mode 100644 index 0000000000..13f9918d34 --- /dev/null +++ b/scio-jdbc/src/main/java/com/spotify/scio/jdbc/JdbcDoFn.java @@ -0,0 +1,81 @@ +/* + * Copyright 2023 Spotify AB. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.spotify.scio.jdbc; + +import java.sql.Connection; +import java.sql.SQLException; +import javax.sql.DataSource; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.SerializableFunction; + +/** + * A {@link DoFn} that performs synchronous lookup using JDBC connection. + * + * @param input element type. + * @param JDBC lookup value type. + */ +public abstract class JdbcDoFn extends DoFn { + + private final SerializableFunction dataSourceProviderFn; + private transient DataSource dataSource; + private transient Connection connection; + + protected JdbcDoFn(SerializableFunction dataSourceProviderFn) { + this.dataSourceProviderFn = dataSourceProviderFn; + } + + @Setup + public void setup() { + dataSource = dataSourceProviderFn.apply(null); + } + + private void createConnection() throws SQLException { + if (dataSource == null) { + throw new RuntimeException("DataSourceProvider " + dataSourceProviderFn + " returned null"); + } + + connection = dataSource.getConnection(); + } + + @Teardown + public void closeConnection() throws SQLException { + if (connection != null) { + try { + connection.close(); + } finally { + connection = null; + } + } + } + + @StartBundle + public void startBundle() throws SQLException { + // recreate a connection if it is lost + if (connection == null || connection.isClosed()) { + createConnection(); + } + } + + @ProcessElement + public void processElement(@Element A input, OutputReceiver out) { + B result = lookup(connection, input); + out.output(result); + } + + public abstract B lookup(Connection connection, A input); +} diff --git a/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/sharded/JdbcShardedReadOptions.scala b/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/sharded/JdbcShardedReadOptions.scala index 1a1b63fc84..2bac3eb98f 100644 --- a/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/sharded/JdbcShardedReadOptions.scala +++ b/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/sharded/JdbcShardedReadOptions.scala @@ -28,7 +28,8 @@ import com.spotify.scio.jdbc.JdbcConnectionOptions * @param tableName * Name of a table or materialized view to read from * @param shardColumn - * Column to shard by. Must be of integer/long type ideally with evenly distributed values + * Column to shard by. Should ideally have evenly distributed values. Column type must have a + * corresponding [[com.spotify.scio.jdbc.sharded.Shard]] implementation. 
* @param rowMapper * Function to map from a SQL [[java.sql.ResultSet]] to `T` * @param fetchSize diff --git a/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/syntax/ScioContextSyntax.scala b/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/syntax/ScioContextSyntax.scala index 472890f1f3..8dfcdb6470 100644 --- a/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/syntax/ScioContextSyntax.scala +++ b/scio-jdbc/src/main/scala/com/spotify/scio/jdbc/syntax/ScioContextSyntax.scala @@ -100,8 +100,7 @@ final class JdbcScioContextOps(private val self: ScioContext) extends AnyVal { * tableName: name of a table or materialized view to read from * * fetchSize: number of records to read from the JDBC source per one call to a database. Default - * value is 100,000. Set to - * -1 to make it unbounded. shard: An implementation of the + * value is 100,000. Set to -1 to make it unbounded. shard: An implementation of the * [[com.spotify.scio.jdbc.sharded.Shard]] trait which knows how to shard a column of a type S. * Example of sharding by a column of type Long: * {{{ diff --git a/scio-jdbc/src/test/scala/com/spotify/scio/jdbc/JdbcDoFnTest.scala b/scio-jdbc/src/test/scala/com/spotify/scio/jdbc/JdbcDoFnTest.scala new file mode 100644 index 0000000000..5bddce93ec --- /dev/null +++ b/scio-jdbc/src/test/scala/com/spotify/scio/jdbc/JdbcDoFnTest.scala @@ -0,0 +1,69 @@ +/* + * Copyright 2023 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.spotify.scio.jdbc + +import com.spotify.scio.testing.PipelineSpec +import org.apache.commons.lang3.SerializationUtils + +import java.io.PrintWriter +import java.sql.Connection +import java.util.logging.Logger +import javax.sql.DataSource + +class JdbcDoFnTest extends PipelineSpec { + + "JdbcDoFn" should "return results from lookup func" in { + val doFn = new JdbcDoFn[Int, String](_ => new DataSourceMock()) { + override def lookup(connection: Connection, input: Int): String = + input.toString + } + + val output = runWithData(1 to 10)(_.parDo(doFn)) + output should contain theSameElementsAs (1 to 10).map(_.toString) + } + + "JdbcDoFn" should "be serializable" in { + val doFn = new JdbcDoFn[Int, String](_ => new DataSourceMock()) { + override def lookup(connection: Connection, input: Int): String = ??? + } + doFn.setup() + doFn.startBundle() + + SerializationUtils.serialize(doFn) + } +} + +class DataSourceMock extends DataSource { + override def getConnection: Connection = null + + override def getConnection(username: String, password: String): Connection = ??? + + override def getLogWriter: PrintWriter = ??? + + override def setLogWriter(out: PrintWriter): Unit = ??? + + override def setLoginTimeout(seconds: Int): Unit = ??? + + override def getLoginTimeout: Int = ??? + + override def getParentLogger: Logger = ??? + + override def unwrap[T](iface: Class[T]): T = ??? + + override def isWrapperFor(iface: Class[_]): Boolean = ??? 
+} diff --git a/scio-neo4j/src/it/scala/com/spotify/scio/neo4j/Neo4jIOIT.scala b/scio-neo4j/src/it/scala/com/spotify/scio/neo4j/Neo4jIOIT.scala index 196e1dfbe5..6bf7b7a038 100644 --- a/scio-neo4j/src/it/scala/com/spotify/scio/neo4j/Neo4jIOIT.scala +++ b/scio-neo4j/src/it/scala/com/spotify/scio/neo4j/Neo4jIOIT.scala @@ -2,13 +2,13 @@ package com.spotify.scio.neo4j import com.dimafeng.testcontainers.{ForAllTestContainer, Neo4jContainer} import com.spotify.scio.testing.PipelineSpec -import org.apache.beam.runners.direct.DirectRunner import org.apache.beam.sdk.options.PipelineOptionsFactory import org.neo4j.driver.{AuthTokens, Driver, GraphDatabase} import org.scalatest.concurrent.Eventually import org.testcontainers.utility.DockerImageName import scala.jdk.CollectionConverters._ +import org.apache.beam.sdk.options.PipelineOptions object Neo4jIOIT { @@ -69,14 +69,14 @@ class Neo4jIOIT extends PipelineSpec with Eventually with ForAllTestContainer { } finally session.close() } - val martin = Person("Martin Sheen") - val morgan = Person("Morgan Freeman") - val michael = Person("Michael Douglas") + val martin: Person = Person("Martin Sheen") + val morgan: Person = Person("Morgan Freeman") + val michael: Person = Person("Michael Douglas") - val americanPresident = Movie("American President", 1995) + val americanPresident: Movie = Movie("American President", 1995) - val options = PipelineOptionsFactory.create() - lazy val neo4jOptions = Neo4jOptions( + val options: PipelineOptions = PipelineOptionsFactory.create() + lazy val neo4jOptions: Neo4jOptions = Neo4jOptions( Neo4jConnectionOptions(container.boltUrl, container.username, container.password) ) diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala index 8fa9ce29e0..d9ebd06adf 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala @@ -213,12 +213,7 @@ object ParquetAvroIO { } } - val useSplittableDoFn = jobConf.getBoolean( - ParquetReadConfiguration.UseSplittableDoFn, - ParquetReadConfiguration.UseSplittableDoFnDefault - ) - - if (useSplittableDoFn) { + if (ParquetReadConfiguration.getUseSplittableDoFn(jobConf, sc.options)) { readSplittableDoFn(sc, jobConf, path) } else { readLegacy(sc, jobConf, path) diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ParquetReadConfiguration.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ParquetReadConfiguration.scala index e138254a84..defe3ec24c 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ParquetReadConfiguration.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ParquetReadConfiguration.scala @@ -16,7 +16,12 @@ package com.spotify.scio.parquet.read +import org.apache.beam.sdk.options.{ExperimentalOptions, PipelineOptions} +import org.apache.hadoop.conf.Configuration +import org.slf4j.LoggerFactory + object ParquetReadConfiguration { + private val log = LoggerFactory.getLogger(getClass) // Key val SplitGranularity = "scio.parquet.read.splitgranularity" @@ -36,4 +41,21 @@ object ParquetReadConfiguration { // SplittableDoFn val UseSplittableDoFn = "scio.parquet.read.useSplittableDoFn" private[scio] val UseSplittableDoFnDefault = false + + private[scio] def getUseSplittableDoFn(conf: Configuration, opts: PipelineOptions): Boolean = { + Option(conf.get(UseSplittableDoFn)) match { + case Some(v) => 
v.toBoolean + case None if dataflowRunnerV2Enabled(opts) => + log.info( + "Defaulting to SplittableDoFn-based Parquet read as Dataflow Runner V2 is enabled. To opt out, " + + "set `scio.parquet.read.useSplittableDoFn -> false` in your read Configuration." + ) + true + case None => + UseSplittableDoFnDefault + } + } + + private def dataflowRunnerV2Enabled(opts: PipelineOptions): Boolean = + Option(opts.as(classOf[ExperimentalOptions]).getExperiments).exists(_.contains("use_runner_v2")) } diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala index b086a936e4..08b933a95d 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala @@ -57,12 +57,8 @@ final case class ParquetExampleIO(path: String) extends ScioIO[Example] { override protected def read(sc: ScioContext, params: ReadP): SCollection[Example] = { val conf = ParquetConfiguration.ofNullable(params.conf) - val useSplittableDoFn = conf.getBoolean( - ParquetReadConfiguration.UseSplittableDoFn, - ParquetReadConfiguration.UseSplittableDoFnDefault - ) - if (useSplittableDoFn) { + if (ParquetReadConfiguration.getUseSplittableDoFn(conf, sc.options)) { readSplittableDoFn(sc, conf, params) } else { readLegacy(sc, conf, params) diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala index 8ebcdb3815..00f0b2b88e 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala @@ -55,12 +55,8 @@ final case class ParquetTypeIO[T: ClassTag: Coder: ParquetType]( override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = { val conf = ParquetConfiguration.ofNullable(params.conf) - val useSplittableDoFn = conf.getBoolean( - ParquetReadConfiguration.UseSplittableDoFn, - ParquetReadConfiguration.UseSplittableDoFnDefault - ) - if (useSplittableDoFn) { + if (ParquetReadConfiguration.getUseSplittableDoFn(conf, sc.options)) { readSplittableDoFn(sc, conf, params) } else { readLegacy(sc, conf, params) diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala index c8230c7a70..80d400d6d4 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala +++ b/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala @@ -59,7 +59,7 @@ class ParquetAvroIOFileNamePolicyTest extends FileNamePolicySpec[TestRecord] { ) } - override def failSaves = Seq( + override def failSaves: Seq[SCollection[Int] => ClosedTap[TestRecord]] = Seq( _.map(AvroUtils.newSpecificRecord).saveAsParquetAvroFile( "nonsense", shardNameTemplate = "SSS-of-NNN", diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/read/ParquetReadFnTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/read/ParquetReadFnTest.scala index cadaa1ff92..f08ca0ab0b 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/read/ParquetReadFnTest.scala +++ b/scio-parquet/src/test/scala/com/spotify/scio/parquet/read/ParquetReadFnTest.scala @@ -25,6 +25,7 @@ import com.spotify.scio.parquet.types._ import 
com.spotify.scio.testing.PipelineSpec import org.apache.commons.io.FileUtils import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} +import org.apache.beam.sdk.options.PipelineOptionsFactory import org.apache.beam.sdk.util.SerializableUtils import org.apache.parquet.filter2.predicate.FilterApi import org.apache.parquet.io.api.Binary @@ -318,6 +319,36 @@ class ParquetReadFnTest extends PipelineSpec with BeforeAndAfterAll { ) } + "ParquetReadConfiguration" should "default to using splittableDoFn only if RunnerV2 experiment is enabled" in { + // Default to true if RunnerV2 is set and user hasn't configured SDF explicitly + ParquetReadConfiguration.getUseSplittableDoFn( + ParquetConfiguration.empty(), + PipelineOptionsFactory.fromArgs("--experiments=use_runner_v2,another_experiment").create() + ) shouldBe true + + // Default to false if RunnerV2 is not set + ParquetReadConfiguration.getUseSplittableDoFn( + ParquetConfiguration.empty(), + PipelineOptionsFactory.fromArgs("--experiments=another_experiment").create() + ) shouldBe false + + ParquetReadConfiguration.getUseSplittableDoFn( + ParquetConfiguration.empty(), + PipelineOptionsFactory.fromArgs().create() + ) shouldBe false + + // Respect user's configuration, if set + ParquetReadConfiguration.getUseSplittableDoFn( + ParquetConfiguration.of(ParquetReadConfiguration.UseSplittableDoFn -> false), + PipelineOptionsFactory.fromArgs("--experiments=use_runner_v2").create() + ) shouldBe false + + ParquetReadConfiguration.getUseSplittableDoFn( + ParquetConfiguration.of(ParquetReadConfiguration.UseSplittableDoFn -> true), + PipelineOptionsFactory.fromArgs().create() + ) shouldBe true + } + private def listFiles(dir: String): Seq[String] = Files .list(Paths.get(dir)) diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala index 7877a5917c..b935cec80d 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala +++ b/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala @@ -111,7 +111,7 @@ class ParquetExampleIOFileNamePolicyTest extends FileNamePolicySpec[Example] { ) } - override def failSaves = Seq( + override def failSaves: Seq[SCollection[Int] => ClosedTap[Example]] = Seq( _.map(newExample).saveAsParquetExampleFile( "nonsense", schema, diff --git a/scio-smb/src/it/scala/com/spotify/scio/smb/SortMergeBucketParityIT.scala b/scio-smb/src/it/scala/com/spotify/scio/smb/SortMergeBucketParityIT.scala index 382b6f9310..b5e55269f6 100644 --- a/scio-smb/src/it/scala/com/spotify/scio/smb/SortMergeBucketParityIT.scala +++ b/scio-smb/src/it/scala/com/spotify/scio/smb/SortMergeBucketParityIT.scala @@ -17,7 +17,7 @@ package com.spotify.scio.smb -import java.nio.file.{Files, Path} +import java.nio.file.Files import com.spotify.scio.ScioContext import com.spotify.scio.avro._ import com.spotify.scio.coders.Coder diff --git a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/AvroFileOperations.java b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/AvroFileOperations.java index dc174f6b11..42674829d1 100644 --- a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/AvroFileOperations.java +++ b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/AvroFileOperations.java @@ -31,6 +31,7 @@ import org.apache.avro.io.DatumReader; import org.apache.avro.reflect.ReflectData; import 
org.apache.avro.reflect.ReflectDatumReader; +import org.apache.avro.reflect.ReflectDatumWriter; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; import org.apache.beam.sdk.extensions.avro.io.AvroIO; @@ -121,7 +122,16 @@ public GenericRecord formatRecord(ValueT element, Schema schema) { } }) .withCodec(codec.getCodec()) - : AvroIO.sink(recordClass).withCodec(codec.getCodec()); + : AvroIO.sink(recordClass) + .withCodec(codec.getCodec()) + .withDatumWriterFactory( + (writer) -> { + // same as SpecificRecordDatumFactory in scio-avro + ReflectData data = new ReflectData(recordClass.getClassLoader()); + org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils + .addLogicalTypeConversions(data); + return new ReflectDatumWriter<>(writer, data); + }); if (metadata != null) { return sink.withMetadata(metadata); @@ -193,10 +203,15 @@ private static class AvroReader extends FileOperations.Reader { public void prepareRead(ReadableByteChannel channel) throws IOException { final Schema schema = schemaSupplier.get(); - DatumReader datumReader = - recordClass == null - ? new GenericDatumReader<>(schema) - : new ReflectDatumReader<>(recordClass); + DatumReader datumReader; + if (recordClass == null) { + datumReader = new GenericDatumReader<>(schema); + } else { + // same as SpecificRecordDatumFactory in scio-avro + ReflectData data = new ReflectData(recordClass.getClassLoader()); + org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils.addLogicalTypeConversions(data); + datumReader = new ReflectDatumReader<>(data); + } reader = new DataFileStream<>(Channels.newInputStream(channel), datumReader); } diff --git a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala index 44573b27de..91d4bcdcfc 100644 --- a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala +++ b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala @@ -57,7 +57,7 @@ final class PredictSCollectionOps[T](private val self: SCollection[T]) { * @param signatureName * name of [[org.tensorflow.framework.SignatureDef]] s to be used to run the prediction. */ - def predict[V: Coder, W]( + def predict[V: Coder]( savedModelUri: String, fetchOps: Seq[String], options: TensorFlowModel.Options, @@ -88,7 +88,7 @@ final class PredictSCollectionOps[T](private val self: SCollection[T]) { * @param signatureName * name of [[org.tensorflow.framework.SignatureDef]] s to be used to run the prediction. 
*/ - def predictWithSigDef[V: Coder, W]( + def predictWithSigDef[V: Coder]( savedModelUri: String, options: TensorFlowModel.Options, fetchOps: Option[Seq[String]] = PredictSCollectionOps.DefaultFetchOps, diff --git a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala index 3e29189c6e..743972c5a8 100644 --- a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala +++ b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala @@ -61,7 +61,7 @@ class TFExampleIOFileNamePolicyTest extends FileNamePolicySpec[Example] { ) } - override def failSaves = Seq( + override def failSaves: Seq[SCollection[Int] => ClosedTap[Example]] = Seq( _.map(x => recordT(Record(x, x.toString))).saveAsTfRecordFile( "nonsense", shardNameTemplate = "SSS-of-NNN", diff --git a/scio-test/src/main/scala/com/spotify/scio/testing/ApproximationAssertions.scala b/scio-test/src/main/scala/com/spotify/scio/testing/ApproximationAssertions.scala index 4bb455398b..44884d3294 100644 --- a/scio-test/src/main/scala/com/spotify/scio/testing/ApproximationAssertions.scala +++ b/scio-test/src/main/scala/com/spotify/scio/testing/ApproximationAssertions.scala @@ -64,9 +64,9 @@ object ApproximationAssertions { } /** - * Check corresponding expected value is off by error percentage. - * i.e. if actual value is `A`, expected values is `B` with error percentage `E`, then assert - * following. (B - ((B / 100) * E)) <= A <= (B + ((B / 100) * E) + * Check corresponding expected value is off by error percentage. i.e. if actual value is `A`, + * expected values is `B` with error percentage `E`, then assert following. (B - ((B / 100) * E)) + * <= A <= (B + ((B / 100) * E) * * Assert above for each element pair. * @param expected @@ -90,9 +90,8 @@ object ApproximationAssertions { /** * Similar to above but works with tuples. Check corresponding expected value is off by error - * percentage. - * i.e. if acutal value is `A`, expected values is `B` with error percentage `E`, then assert - * following. (B - ((B / 100) * E)) <= A <= (B + ((B / 100) * E) + * percentage. i.e. if acutal value is `A`, expected values is `B` with error percentage `E`, then + * assert following. (B - ((B / 100) * E)) <= A <= (B + ((B / 100) * E) * * Assert above for each key in the actual. 
* @param expected diff --git a/scio-test/src/main/scala/com/spotify/scio/testing/TransformOverride.scala b/scio-test/src/main/scala/com/spotify/scio/testing/TransformOverride.scala index 6554ad05aa..654fbca61c 100644 --- a/scio-test/src/main/scala/com/spotify/scio/testing/TransformOverride.scala +++ b/scio-test/src/main/scala/com/spotify/scio/testing/TransformOverride.scala @@ -76,11 +76,16 @@ object TransformOverride { java.lang.Double.TYPE -> classOf[java.lang.Double] ) - private def typeValidation[A, B](failMsg: String, aIn: Class[A], bIn: Class[B]): Unit = { + private def typeValidation[A, B]( + expectedIn: Class[B], + actualIn: Class[A], + failMsg: String + ): Unit = { // get normal java types instead of primitives - val (a, b) = (primitiveMapping.getOrElse(aIn, aIn), primitiveMapping.getOrElse(bIn, bIn)) - if (!a.isAssignableFrom(b)) - throw new IllegalArgumentException(s"$failMsg Expected: ${aIn} Found: ${bIn}") + val expected = primitiveMapping.getOrElse(expectedIn, expectedIn) + val actual = primitiveMapping.getOrElse(actualIn, actualIn) + if (!expected.isAssignableFrom(actual)) + throw new IllegalArgumentException(s"$failMsg Expected: $expected Found: $actual") } /** @@ -105,9 +110,9 @@ object TransformOverride { def of[T: ClassTag, U](name: String, fn: T => U): PTransformOverride = { val wrappedFn: T => U = fn.compose { t: T => typeValidation( - s"Input for override transform $name does not match pipeline transform.", + implicitly[ClassTag[T]].runtimeClass, t.getClass, - implicitly[ClassTag[T]].runtimeClass + s"Input for override transform $name does not match pipeline transform." ) t } @@ -132,9 +137,9 @@ object TransformOverride { val wrappedFn: T => JIterable[U] = fn .compose { t: T => typeValidation( - s"Input for override transform $name does not match pipeline transform.", + implicitly[ClassTag[T]].runtimeClass, t.getClass, - implicitly[ClassTag[T]].runtimeClass + s"Input for override transform $name does not match pipeline transform." ) t } diff --git a/scio-test/src/test/scala/com/spotify/scio/coders/BeamCodersTest.scala b/scio-test/src/test/scala/com/spotify/scio/coders/BeamCodersTest.scala new file mode 100644 index 0000000000..fda3d009a3 --- /dev/null +++ b/scio-test/src/test/scala/com/spotify/scio/coders/BeamCodersTest.scala @@ -0,0 +1,128 @@ +/* + * Copyright 2023 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.spotify.scio.coders + +import com.spotify.scio.ScioContext +import org.apache.beam.sdk.coders.{ + BigEndianShortCoder, + ByteCoder, + Coder => BCoder, + StringUtf8Coder, + StructuredCoder, + VarIntCoder +} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks + +import java.io.{InputStream, OutputStream} +import java.util.{Arrays => JArrays, List => JList} + +object BeamCodersTest { + class CustomKeyValueCoder[K, V](keyCoder: BCoder[K], valueCoder: BCoder[V]) + extends StructuredCoder[(String, Int)] { + override def encode(value: (String, Int), outStream: OutputStream): Unit = ??? + override def decode(inStream: InputStream): (String, Int) = ??? + override def getCoderArguments: JList[_ <: BCoder[_]] = + JArrays.asList(keyCoder, valueCoder) + override def verifyDeterministic(): Unit = ??? + } +} + +class BeamCodersTest extends AnyFlatSpec with Matchers with TableDrivenPropertyChecks { + + import BeamCodersTest._ + + "BeamCoders" should "get scio coder from SCollection" in { + val sc = ScioContext() + val coll = sc.empty[String]() + val coder = BeamCoders.getCoder(coll) + coder shouldBe a[Beam[_]] + val beamKeyCoder = coder.asInstanceOf[Beam[_]] + beamKeyCoder.beam shouldBe StringUtf8Coder.of() + } + + it should "get scio coders from tupled2 SCollection" in { + val coders = Table[Coder[(String, Int)]]( + "coder", + Coder.tuple2Coder, + Coder.gen, + Coder.beam(new CustomKeyValueCoder(StringUtf8Coder.of(), VarIntCoder.of())) + ) + + forAll(coders) { coder => + val sc = ScioContext() + val coll = sc.empty()(coder) + val (keyCoder, valueCoder) = BeamCoders.getTupleCoders(coll) + keyCoder shouldBe a[Beam[_]] + val beamKeyCoder = keyCoder.asInstanceOf[Beam[_]] + beamKeyCoder.beam shouldBe StringUtf8Coder.of() + valueCoder shouldBe a[Beam[_]] + val beamValueCoder = valueCoder.asInstanceOf[Beam[_]] + beamValueCoder.beam shouldBe VarIntCoder.of() + } + } + + it should "get scio coders from tupled3 SCollection" in { + val coders = Table[Coder[(String, Int, Short)]]( + "coder", + Coder.tuple3Coder, + Coder.gen + ) + + forAll(coders) { coder => + val sc = ScioContext() + val coll = sc.empty()(coder) + val (c1, c2, c3) = BeamCoders.getTuple3Coders(coll) + c1 shouldBe a[Beam[_]] + val beamCoder1 = c1.asInstanceOf[Beam[_]] + beamCoder1.beam shouldBe StringUtf8Coder.of() + c2 shouldBe a[Beam[_]] + val beamCoder2 = c2.asInstanceOf[Beam[_]] + beamCoder2.beam shouldBe VarIntCoder.of() + c3 shouldBe a[Beam[_]] + val beamCoder3 = c3.asInstanceOf[Beam[_]] + beamCoder3.beam shouldBe BigEndianShortCoder.of() + } + } + + it should "get scio coders from tupled4 SCollection" in { + val coders = Table[Coder[(String, Int, Short, Byte)]]( + "coder", + Coder.tuple4Coder, + Coder.gen + ) + + forAll(coders) { coder => + val sc = ScioContext() + val coll = sc.empty()(coder) + val (c1, c2, c3, c4) = BeamCoders.getTuple4Coders(coll) + c1 shouldBe a[Beam[_]] + val beamCoder1 = c1.asInstanceOf[Beam[_]] + beamCoder1.beam shouldBe StringUtf8Coder.of() + c2 shouldBe a[Beam[_]] + val beamCoder2 = c2.asInstanceOf[Beam[_]] + beamCoder2.beam shouldBe VarIntCoder.of() + c3 shouldBe a[Beam[_]] + val beamCoder3 = c3.asInstanceOf[Beam[_]] + beamCoder3.beam shouldBe BigEndianShortCoder.of() + c4 shouldBe a[Beam[_]] + val beamCoder4 = c4.asInstanceOf[Beam[_]] + beamCoder4.beam shouldBe ByteCoder.of() + } + } +} diff --git a/scio-test/src/test/scala/com/spotify/scio/transforms/SpecializedFlatMapSCollectionTest.scala 
b/scio-test/src/test/scala/com/spotify/scio/transforms/SafeFlatMapSCollectionTest.scala similarity index 95% rename from scio-test/src/test/scala/com/spotify/scio/transforms/SpecializedFlatMapSCollectionTest.scala rename to scio-test/src/test/scala/com/spotify/scio/transforms/SafeFlatMapSCollectionTest.scala index 1f56c4c539..0576909a75 100644 --- a/scio-test/src/test/scala/com/spotify/scio/transforms/SpecializedFlatMapSCollectionTest.scala +++ b/scio-test/src/test/scala/com/spotify/scio/transforms/SafeFlatMapSCollectionTest.scala @@ -19,7 +19,7 @@ package com.spotify.scio.transforms import com.spotify.scio.testing.PipelineSpec -class SpecializedFlatMapSCollectionTest extends PipelineSpec { +class SafeFlatMapSCollectionTest extends PipelineSpec { "SpecializedFlatMapSCollectionTest" should "support safeFlatMap()" in { val errorMsg = "String contains 'a'" runWithContext { sc => diff --git a/scio-test/src/test/scala/com/spotify/scio/values/PairSCollectionFunctionsTest.scala b/scio-test/src/test/scala/com/spotify/scio/values/PairSCollectionFunctionsTest.scala index 0e88697c51..1b0bda3f3e 100644 --- a/scio-test/src/test/scala/com/spotify/scio/values/PairSCollectionFunctionsTest.scala +++ b/scio-test/src/test/scala/com/spotify/scio/values/PairSCollectionFunctionsTest.scala @@ -17,6 +17,7 @@ package com.spotify.scio.values +import com.spotify.scio.ScioContext import com.spotify.scio.coders.{Beam, MaterializedCoder} import com.spotify.scio.testing.PipelineSpec import com.spotify.scio.util.random.RandomSamplerUtils @@ -30,56 +31,54 @@ import scala.collection.mutable class PairSCollectionFunctionsTest extends PipelineSpec { "PairSCollection" should "propagates unwrapped coders" in { - runWithContext { sc => - val coll = sc.empty[(String, Int)]() - // internal is wrapped - val internalCoder = coll.internal.getCoder - internalCoder shouldBe a[MaterializedCoder[_]] - val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] - materializedCoder.bcoder shouldBe a[StructuredCoder[_]] - val tupleCoder = materializedCoder.bcoder.asInstanceOf[StructuredCoder[_]] - val keyCoder = tupleCoder.getComponents.get(0) - keyCoder shouldBe StringUtf8Coder.of() - val valueCoder = tupleCoder.getComponents.get(1) - valueCoder shouldBe VarIntCoder.of() - // implicit SCollection key and value coder aren't - coll.keyCoder shouldBe a[Beam[_]] - val beamKeyCoder = coll.keyCoder.asInstanceOf[Beam[_]] - beamKeyCoder.beam shouldBe StringUtf8Coder.of() - - coll.valueCoder shouldBe a[Beam[_]] - val beamValueCoder = coll.valueCoder.asInstanceOf[Beam[_]] - beamValueCoder.beam shouldBe VarIntCoder.of() - } + val sc = ScioContext() + val coll = sc.empty[(String, Int)]() + // internal is wrapped + val internalCoder = coll.internal.getCoder + internalCoder shouldBe a[MaterializedCoder[_]] + val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] + materializedCoder.bcoder shouldBe a[StructuredCoder[_]] + val tupleCoder = materializedCoder.bcoder.asInstanceOf[StructuredCoder[_]] + val keyCoder = tupleCoder.getComponents.get(0) + keyCoder shouldBe StringUtf8Coder.of() + val valueCoder = tupleCoder.getComponents.get(1) + valueCoder shouldBe VarIntCoder.of() + // implicit SCollection key and value coder aren't + coll.keyCoder shouldBe a[Beam[_]] + val beamKeyCoder = coll.keyCoder.asInstanceOf[Beam[_]] + beamKeyCoder.beam shouldBe StringUtf8Coder.of() + + coll.valueCoder shouldBe a[Beam[_]] + val beamValueCoder = coll.valueCoder.asInstanceOf[Beam[_]] + beamValueCoder.beam shouldBe VarIntCoder.of() } it should 
"propagate unwrapped nullable coders" in { - runWithContext { sc => - sc.optionsAs[ScioOptions].setNullableCoders(true) - - val coll = sc.empty[(String, Int)]() - // internal is wrapped - val internalCoder = coll.internal.getCoder - internalCoder shouldBe a[MaterializedCoder[_]] - val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] - materializedCoder.bcoder shouldBe a[NullableCoder[_]] - val nullableTupleCoder = materializedCoder.bcoder.asInstanceOf[NullableCoder[_]] - val tupleCoder = nullableTupleCoder.getValueCoder.asInstanceOf[StructuredCoder[_]] - val keyCoder = tupleCoder.getComponents.get(0) - keyCoder shouldBe a[NullableCoder[_]] - keyCoder.asInstanceOf[NullableCoder[_]].getValueCoder shouldBe StringUtf8Coder.of() - val valueCoder = tupleCoder.getComponents.get(1) - valueCoder shouldBe a[NullableCoder[_]] - valueCoder.asInstanceOf[NullableCoder[_]].getValueCoder shouldBe VarIntCoder.of() - // implicit SCollection key and value coder aren't - coll.keyCoder shouldBe a[Beam[_]] - val beamKeyCoder = coll.keyCoder.asInstanceOf[Beam[_]] - beamKeyCoder.beam shouldBe StringUtf8Coder.of() - - coll.valueCoder shouldBe a[Beam[_]] - val beamValueCoder = coll.valueCoder.asInstanceOf[Beam[_]] - beamValueCoder.beam shouldBe VarIntCoder.of() - } + val sc = ScioContext() + sc.optionsAs[ScioOptions].setNullableCoders(true) + + val coll = sc.empty[(String, Int)]() + // internal is wrapped + val internalCoder = coll.internal.getCoder + internalCoder shouldBe a[MaterializedCoder[_]] + val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] + materializedCoder.bcoder shouldBe a[NullableCoder[_]] + val nullableTupleCoder = materializedCoder.bcoder.asInstanceOf[NullableCoder[_]] + val tupleCoder = nullableTupleCoder.getValueCoder.asInstanceOf[StructuredCoder[_]] + val keyCoder = tupleCoder.getComponents.get(0) + keyCoder shouldBe a[NullableCoder[_]] + keyCoder.asInstanceOf[NullableCoder[_]].getValueCoder shouldBe StringUtf8Coder.of() + val valueCoder = tupleCoder.getComponents.get(1) + valueCoder shouldBe a[NullableCoder[_]] + valueCoder.asInstanceOf[NullableCoder[_]].getValueCoder shouldBe VarIntCoder.of() + // implicit SCollection key and value coder aren't + coll.keyCoder shouldBe a[Beam[_]] + val beamKeyCoder = coll.keyCoder.asInstanceOf[Beam[_]] + beamKeyCoder.beam shouldBe StringUtf8Coder.of() + + coll.valueCoder shouldBe a[Beam[_]] + val beamValueCoder = coll.valueCoder.asInstanceOf[Beam[_]] + beamValueCoder.beam shouldBe VarIntCoder.of() } it should "support cogroup()" in { diff --git a/scio-test/src/test/scala/com/spotify/scio/values/SCollectionTest.scala b/scio-test/src/test/scala/com/spotify/scio/values/SCollectionTest.scala index 89cc4ca5c0..8c5db99995 100644 --- a/scio-test/src/test/scala/com/spotify/scio/values/SCollectionTest.scala +++ b/scio-test/src/test/scala/com/spotify/scio/values/SCollectionTest.scala @@ -55,39 +55,36 @@ class SCollectionTest extends PipelineSpec { import SCollectionTest._ "SCollection" should "propagates unwrapped coders" in { - runWithContext { sc => - val coll = sc.empty[String]() - // internal is wrapped - val internalCoder = coll.internal.getCoder - internalCoder shouldBe a[MaterializedCoder[_]] - val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] - materializedCoder.bcoder shouldBe StringUtf8Coder.of() - // implicit SCollection coder is not - val scioCoder = coll.coder - scioCoder shouldBe a[Beam[_]] - val beamCoder = scioCoder.asInstanceOf[Beam[_]] - beamCoder.beam shouldBe StringUtf8Coder.of() - } + 
val sc = ScioContext() + val coll = sc.empty[String]() + // internal is wrapped + val internalCoder = coll.internal.getCoder + internalCoder shouldBe a[MaterializedCoder[_]] + val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] + materializedCoder.bcoder shouldBe StringUtf8Coder.of() + // implicit SCollection coder is not + val scioCoder = coll.coder + scioCoder shouldBe a[Beam[_]] + val beamCoder = scioCoder.asInstanceOf[Beam[_]] + beamCoder.beam shouldBe StringUtf8Coder.of() } it should "propagates unwrapped nullable coders" in { - runWithContext { sc => - sc.optionsAs[ScioOptions].setNullableCoders(true) - - val coll = sc.empty[String]() - // internal is wrapped - val internalCoder = coll.internal.getCoder - internalCoder shouldBe a[MaterializedCoder[_]] - val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] - materializedCoder.bcoder shouldBe a[NullableCoder[_]] - val nullableCoder = materializedCoder.bcoder.asInstanceOf[NullableCoder[_]] - nullableCoder.getValueCoder shouldBe StringUtf8Coder.of() - // implicit SCollection coder is not - val scioCoder = coll.coder - scioCoder shouldBe a[Beam[_]] - val beamCoder = scioCoder.asInstanceOf[Beam[_]] - beamCoder.beam shouldBe StringUtf8Coder.of() - } + val sc = ScioContext() + sc.optionsAs[ScioOptions].setNullableCoders(true) + val coll = sc.empty[String]() + // internal is wrapped + val internalCoder = coll.internal.getCoder + internalCoder shouldBe a[MaterializedCoder[_]] + val materializedCoder = internalCoder.asInstanceOf[MaterializedCoder[_]] + materializedCoder.bcoder shouldBe a[NullableCoder[_]] + val nullableCoder = materializedCoder.bcoder.asInstanceOf[NullableCoder[_]] + nullableCoder.getValueCoder shouldBe StringUtf8Coder.of() + // implicit SCollection coder is not + val scioCoder = coll.coder + scioCoder shouldBe a[Beam[_]] + val beamCoder = scioCoder.asInstanceOf[Beam[_]] + beamCoder.beam shouldBe StringUtf8Coder.of() } it should "support applyTransform()" in { diff --git a/site/src/main/paradox/Builtin.md b/site/src/main/paradox/Builtin.md new file mode 100644 index 0000000000..2d548c642b --- /dev/null +++ b/site/src/main/paradox/Builtin.md @@ -0,0 +1,446 @@ +# Built-in Functionality + +Scio is a thin wrapper on top of Beam offering idiomatic Scala APIs. Check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. + +## Basics + +- @scaladoc[ScioContext](com.spotify.scio.ScioContext) wraps Beam's @javadoc[Pipeline](org.apache.beam.sdk.Pipeline) +- @scaladoc[SCollection](com.spotify.scio.values.SCollection) wraps Beam's @javadoc[PCollection](org.apache.beam.sdk.values.PCollection) +- @scaladoc[ScioResult](com.spotify.scio.ScioResult) wraps Beam's @javadoc[PipelineResult](org.apache.beam.sdk.PipelineResult) + +See dedicated sections on: +- @ref[IO](io/index.md) +- @ref[Joins](Joins.md) +- @ref[Side Inputs](SideInputs.md) + +## Core functionality + +A `ScioContext` represents the pipeline and is the starting point for performing reads and the means by which the pipeline is executed. 
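+
+As a minimal sketch (the file paths below are placeholders), a pipeline reads from a source into an `SCollection`, transforms it, and writes the result back out:
+
+```scala mdoc:compile-only
+import com.spotify.scio._
+import com.spotify.scio.values.SCollection
+
+val sc: ScioContext = ???
+
+// read lines of text, transform them, and write them to an output location
+val lines: SCollection[String] = sc.textFile("gs://input-path/*.txt")
+lines.map(_.toUpperCase).saveAsTextFile("gs://output-path")
+```
+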
+Execute a pipeline by invoking @scaladoc[run](com.spotify.scio.ScioContext#run():com.spotify.scio.ScioExecutionContext) and await completion by chaining @scaladoc[waitUntilDone](com.spotify.scio.ScioExecutionContext#waitUntilDone(duration:scala.concurrent.duration.Duration,cancelJob:Boolean):com.spotify.scio.ScioResult): + +```scala mdoc:compile-only +import com.spotify.scio._ +val sc: ScioContext = ??? +sc.run().waitUntilDone() +``` + +`SCollection` is the representation of the data in a pipeline at a particular point in the execution graph preceding or following a transform. +`SCollection`s have many of the methods you would expect on a standard Scala collection: `map`, `filter`, `flatten`, `flatMap`, `reduce`, `collect`, `fold`, and `take`. + +Any `SCollection` of 2-tuples is considered a _keyed_ `SCollection` and the various @ref[joins](Joins.md) and `*ByKey` variants of other methods become available. +The first item in the tuple is considered the key and the second item the value. +The @scaladoc[keyBy](com.spotify.scio.values.SCollection#keyBy[K](f:T=%3EK)(implicitevidence$22:com.spotify.scio.coders.Coder[K]):com.spotify.scio.values.SCollection[(K,T)]) method creates a keyed `SCollection`, where the user-defined function extracts the key from the exising values: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val elements: SCollection[String] = ??? +val result: SCollection[(String, String)] = elements.keyBy(_.head.toString) +``` + +Once keyed, elements with the same key can be grouped so that they can be processed together: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val elements: SCollection[(String, String)] = ??? +val result: SCollection[(String, Iterable[String])] = elements.groupByKey +``` + +Distinct elements can be found with @scaladoc[distinct](com.spotify.scio.values.SCollection#distinct:com.spotify.scio.values.SCollection[T]) (or the `distinctBy` and `distinctByKey` variants): + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val elements: SCollection[String] = ??? +val distinct: SCollection[String] = elements.distinct +``` + +Elements can be split into different `SCollection`s with @scaladoc[partition](com.spotify.scio.values.SCollection#partition(p:T=%3EBoolean):(com.spotify.scio.values.SCollection[T],com.spotify.scio.values.SCollection[T])), which can be useful for error handling. +Note that the number of partitions should be small. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val elements: SCollection[Int] = ??? +val (lessThanFive, greaterThanFive): (SCollection[Int], SCollection[Int]) = elements.partition(_ > 5) +``` + +`SCollection`s of the same type can be combined with a @scaladoc[union](com.spotify.scio.values.SCollection#union(that:com.spotify.scio.values.SCollection[T]):com.spotify.scio.values.SCollection[T]) (or +@scaladoc[unionAll](com.spotify.scio.ScioContext#unionAll[T](scs:=%3EIterable[com.spotify.scio.values.SCollection[T]])(implicitevidence$6:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T])) operation. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val a: SCollection[Int] = ??? +val b: SCollection[Int] = ??? 
+val elements: SCollection[Int] = a.union(b) +``` + +Elements can be printed to the console for inspection at any point of the graph by using @scaladoc[debug](com.spotify.scio.values.SCollection#debug(out:()=%3Ejava.io.PrintStream,prefix:String,enabled:Boolean):com.spotify.scio.values.SCollection[T]): + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val elements: SCollection[String] = ??? +elements.debug(prefix = "myLabel: ") +``` + +## ContextAndArgs + +Scio's @scaladoc[ContextAndArgs](com.spotify.scio.ContextAndArgs) provides a convenient way to both parse command-line options and acquire a `ScioContext`: + +```scala mdoc:compile-only +import com.spotify.scio._ + +val cmdlineArgs: Array[String] = ??? +val (sc, args) = ContextAndArgs(cmdlineArgs) +``` + +If you need custom pipeline options, subclass Beam's @javadoc[PipelineOptions](org.apache.beam.sdk.options.PipelineOptions) and use `ContextAndArgs.typed`: + +```scala mdoc:compile-only +import com.spotify.scio._ +import org.apache.beam.sdk.options.PipelineOptions + +trait Arguments extends PipelineOptions { + def getMyArg: String + def setMyArg(input: String): Unit +} + +val cmdlineArgs: Array[String] = ??? +val (sc, args) = ContextAndArgs.typed[Arguments](cmdlineArgs) +val myArg: String = args.getMyArg +``` + +## Aggregations + +Scio provides a suite of built-in aggregations. +All `*ByKey` variants do the same as the normal function, but per-key for keyed `SCollection`s. + +### Counting + +* @scaladoc[count](com.spotify.scio.values.SCollection#count:com.spotify.scio.values.SCollection[Long]) (or `countByKey`) counts the number of elements +* @scaladoc[countByValue](com.spotify.scio.values.SCollection#countByValue:com.spotify.scio.values.SCollection[(T,Long)]) counts the number of elements for each value in a `SCollection[T]` +* @scaladoc[countApproxDistinct](com.spotify.scio.values.SCollection#countApproxDistinct(estimator:com.spotify.scio.estimators.ApproxDistinctCounter[T]):com.spotify.scio.values.SCollection[Long]) (or `countApproxDistinctByKey`) estimates a distinct count, with Beam's @javadoc[ApproximateUnique](org.apache.beam.sdk.transforms.ApproximateUnique) or Scio's HyperLogLog-based @scaladoc[ApproxDistinctCounter](com.spotify.scio.estimators.ApproxDistinctCounter) + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.hll.zetasketch.ZetaSketchHllPlusPlus + +val elements: SCollection[String] = ??? 
+val sketch = ZetaSketchHllPlusPlus[String]() +val result: SCollection[Long] = elements.countApproxDistinct(sketch) +``` + +### Statistics + +* @scaladoc[max](com.spotify.scio.values.SCollection#max(implicitord:Ordering[T]):com.spotify.scio.values.SCollection[T]) (or `maxByKey`) finds the maximum element given some @scaladoc[Ordering](scala.math.Ordering) +* @scaladoc[min](com.spotify.scio.values.SCollection#min(implicitord:Ordering[T]):com.spotify.scio.values.SCollection[T]) (or `minByKey`) finds the minimum element given some @scaladoc[Ordering](scala.math.Ordering) +* @scaladoc[mean](com.spotify.scio.values.SCollection#mean(implicitev:Numeric[T]):com.spotify.scio.values.SCollection[Double]) finds the mean given some @scaladoc[Numeric](scala.math.Numeric) +* @scaladoc[quantilesApprox](com.spotify.scio.values.SCollection#quantilesApprox(numQuantiles:Int)(implicitord:Ordering[T]):com.spotify.scio.values.SCollection[Iterable[T]]) (or `approxQuantilesByKey`) finds the distribution using Beam's @javadoc[ApproximateQuantiles](org.apache.beam.sdk.transforms.ApproximateQuantiles) + +For `SCollection`s containing `Double`, Scio additionally provides a @scaladoc[stats](com.spotify.scio.values.DoubleSCollectionFunctions#stats:com.spotify.scio.values.SCollection[com.spotify.scio.util.StatCounter]) method that computes the count, mean, min, max, variance, standard deviation, sample variance, and sample standard deviation over the `SCollection`. +Convenience methods are available directly on the @scaladoc[SCollection](com.spotify.scio.values.DoubleSCollectionFunctions) if only a single value is required: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.util.StatCounter + +val elements: SCollection[Double] = ??? + +val stats: SCollection[StatCounter] = elements.stats +val variance: SCollection[Double] = stats.map { s => s.variance } + +val stdev: SCollection[Double] = elements.stdev +``` + +### Sums & combinations + +@scaladoc[combine](com.spotify.scio.values.SCollection#combine[C](createCombiner:T=%3EC)(mergeValue:(C,T)=%3EC)(mergeCombiners:(C,C)=%3EC)(implicitevidence$15:com.spotify.scio.coders.Coder[C]):com.spotify.scio.values.SCollection[C]) (or `combineByKey`) combines elements with a set of user-defined functions: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +case class A(count: Long, total: Long) +object A { + def apply(i: Int): A = A(1L, i) + def mergeValue(a: A, i: Int): A = A(a.count + 1L, a.total + i) + def mergeCombiners(a: A, b: A) = A(a.count + b.count, a.total + b.total) +} + +val elements: SCollection[Int] = ??? +elements.combine(A.apply)(A.mergeValue)(A.mergeCombiners) +``` + +@scaladoc[sum](com.spotify.scio.values.SCollection#sum(implicitsg:com.twitter.algebird.Semigroup[T]):com.spotify.scio.values.SCollection[T]) (or `sumByKey`) sums elements given a @scaladoc[Semigroup](com.twitter.algebird.Semigroup), while @scaladoc[aggregate](com.spotify.scio.values.SCollection#aggregate[A,U](aggregator:com.twitter.algebird.MonoidAggregator[T,A,U])(implicitevidence$12:com.spotify.scio.coders.Coder[A],implicitevidence$13:com.spotify.scio.coders.Coder[U]):com.spotify.scio.values.SCollection[U]) (or `aggregateByKey`) aggregates elements either with a set of user-defined functions, via a @scaladoc[Aggregator](com.twitter.algebird.Aggregator), or via a @scaladoc[MonoidAggregator](com.twitter.algebird.MonoidAggregator). 
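+
+A minimal sketch of both, assuming Algebird's built-in instances for `Int` are in implicit scope:
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+import com.twitter.algebird.Aggregator
+
+val elements: SCollection[(String, Int)] = ???
+
+// sum values per key using the implicit Semigroup[Int]
+val summedByKey: SCollection[(String, Int)] = elements.sumByKey
+
+// aggregate all values with an Algebird Aggregator
+val maxValue: SCollection[Int] = elements.values.aggregate(Aggregator.max[Int])
+```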
+ +Both `Semigroup` and `Monoid` instances can be derived with [magnolify](https://github.com/spotify/magnolify/blob/main/docs/derivation.md), assuming the behavior for the primitive types is what you expect. + +@@@ note + +Note that for `String` the default `Semigroup[String]` behavior is to append, which is usually not what you want. + +@@@ + +Fully-automatic derivation can be very concise but relies on some implicit magic: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.twitter.algebird._ +import magnolify.cats.auto._ +import cats._ + +case class A(count: Long, total: Long) + +val elements: SCollection[A] = ??? + +val summed: SCollection[A] = elements.sum +val aggregated: SCollection[A] = elements.aggregate(Aggregator.fromMonoid[A]) +``` + +Semi-automatic derivation in a companion object may be more intelligible: + +```scala mdoc:compile-only +case class A(count: Long, total: Long) +object A { + import magnolify.cats.semiauto._ + import cats._ + implicit val aMonoid: cats.Monoid[A] = MonoidDerivation[A] +} +``` + +See also @ref[Algebird](extras/Algebird.md) + +## Metrics + +Scio supports Beam's @javadoc[Counter](org.apache.beam.sdk.metrics.Counter) @javadoc[Distribution](org.apache.beam.sdk.metrics.Distribution) and @javadoc[Gauge](org.apache.beam.sdk.metrics.Gauge). + +See @extref[MetricsExample](example:MetricsExample). + +## ScioResult + +@scaladoc[ScioResult](com.spotify.scio.ScioResult) can be used to access metric values, individually or as a group: + +```scala mdoc:compile-only +import com.spotify.scio._ +import org.apache.beam.sdk.metrics.{MetricName, Counter} + +val sc: ScioContext = ??? +val counter: Counter = ??? + +val sr: ScioResult = sc.run().waitUntilDone() +val counterValue: metrics.MetricValue[Long] = sr.counter(counter) +val counterMap: Map[MetricName, metrics.MetricValue[Long]] = sr.allCounters +``` + +## Taps & Materialization + +Writes return a @scaladoc[ClosedTap](com.spotify.scio.io.ClosedTap), which provides an interface to access the written results or pass them to a subsequent Scio job. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.io.{Tap, ClosedTap} +import com.spotify.scio.values.SCollection + +val sc: ScioContext = ??? +val elements: SCollection[String] = ??? +val writeTap: ClosedTap[String] = elements.saveAsTextFile("gs://output-path") + +val sr: ScioResult = sc.run().waitUntilDone() + +val textTap: Tap[String] = sr.tap(writeTap) +val textContexts: Iterator[String] = textTap.value + +val sc2: ScioContext = ??? +val results: SCollection[String] = textTap.open(sc) +``` + +The same mechanism underlies Scio's @scaladoc[materialize](com.spotify.scio.values.SCollection#materialize:com.spotify.scio.io.ClosedTap[T]) method, which will save the contents of an `SCollection` at the point of the `materialize` to a temporary location and make them available after the pipeline completes: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.io.{Tap, ClosedTap} +import com.spotify.scio.values.SCollection + +val sc: ScioContext = ??? +val elements: SCollection[String] = ??? +val materializeTap: ClosedTap[String] = elements.materialize + +val sr: ScioResult = sc.run().waitUntilDone() +val textTap: Tap[String] = sr.tap(materializeTap) +``` + +See also: @extref[WordCountOrchestration](example:WordCountOrchestration) example. 
+ +## Use native Beam functionality + +If there is a need to use a Beam IO or transform for which Scio does not have an API, you can easily use the native Beam API for single steps in a pipeline otherwise written in Scio. + +@scaladoc[customInput](com.spotify.scio.ScioContext#customInput[T,I%3E:org.apache.beam.sdk.values.PBegin%3C:org.apache.beam.sdk.values.PInput](name:String,transform:org.apache.beam.sdk.transforms.PTransform[I,org.apache.beam.sdk.values.PCollection[T]]):com.spotify.scio.values.SCollection[T]) supports reading from a Beam source; any transform of type `PTransform[PBegin, PCollection[T]]`: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.transforms.PTransform +import org.apache.beam.sdk.values.{PBegin, PCollection} +import org.apache.beam.sdk.io.TextIO + +val sc: ScioContext = ??? +val filePattern: String = ??? + +val textRead: PTransform[PBegin, PCollection[String]] = TextIO.read().from(filePattern) +val elements: SCollection[String] = sc.customInput("ReadText", textRead) +``` + +@scaladoc[saveAsCustomOutput](com.spotify.scio.values.SCollection#saveAsCustomOutput[O%3C:org.apache.beam.sdk.values.POutput](name:String,transform:org.apache.beam.sdk.transforms.PTransform[org.apache.beam.sdk.values.PCollection[T],O]):com.spotify.scio.io.ClosedTap[Nothing]) supports writing to a Beam sink; any transform of type `PTransform[PCollection[T], PDone]`: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.transforms.PTransform +import org.apache.beam.sdk.values.{PDone, PCollection} +import org.apache.beam.sdk.io.TextIO + +val outputLocation: String = ??? +val elements: SCollection[String] = ??? +val textWrite: PTransform[PCollection[String], PDone] = TextIO.write().to(outputLocation) +elements.saveAsCustomOutput("WriteText", textWrite) +``` + +Finally, @scaladoc[applyTransform](com.spotify.scio.values.SCollection#applyTransform[U](transform:org.apache.beam.sdk.transforms.PTransform[_%3E:org.apache.beam.sdk.values.PCollection[T],org.apache.beam.sdk.values.PCollection[U]])(implicitevidence$2:com.spotify.scio.coders.Coder[U]):com.spotify.scio.values.SCollection[U]) supports using any Beam transform of type `PTransform[PCollection[T], PCollection[U]]`: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.transforms.{PTransform, Sum} +import org.apache.beam.sdk.values.{PDone, PCollection} +import java.lang + +val elements: SCollection[Double] = ??? +val transform: PTransform[PCollection[lang.Double], PCollection[lang.Double]] = Sum.doublesGlobally +val result: SCollection[lang.Double] = elements + .map(Double.box) + .applyTransform(transform) +``` + +See also: @extref[BeamExample](example:BeamExample) + +## Windowing + +@scaladoc[timestampBy](com.spotify.scio.values.SCollection#timestampBy(f:T=%3Eorg.joda.time.Instant,allowedTimestampSkew:org.joda.time.Duration):com.spotify.scio.values.SCollection[T]) allows for changing an element's timestamp: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import org.joda.time.Instant + +case class A(timestamp: Instant, value: String) +val elements: SCollection[A] = ??? 
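+
+// assign each element's event-time timestamp from its own timestamp field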
+val timestamped: SCollection[A] = elements.timestampBy(_.timestamp) +``` + +The @scaladoc[withTimestamp](com.spotify.scio.values.SCollection#withTimestamp:com.spotify.scio.values.SCollection[(T,org.joda.time.Instant)]), @scaladoc[withWindow](com.spotify.scio.values.SCollection#withWindow[W%3C:org.apache.beam.sdk.transforms.windowing.BoundedWindow]:com.spotify.scio.values.SCollection[(T,W)]), and @scaladoc[withPaneInfo](com.spotify.scio.values.SCollection#withPaneInfo:com.spotify.scio.values.SCollection[(T,org.apache.beam.sdk.transforms.windowing.PaneInfo)]) functions flatten window metadata into the `SCollection`: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import org.joda.time.Instant + +val elements: SCollection[String] = ??? +val timestamped: SCollection[(String, Instant)] = elements.withTimestamp +``` + +@scaladoc[toWindowed](com.spotify.scio.values.SCollection#toWindowed:com.spotify.scio.values.WindowedSCollection[T]) converts the `SCollection` to a @scaladoc[WindowedSCollection](com.spotify.scio.values.WindowedSCollection) whose elements are all instances of @scaladoc[WindowedValue](com.spotify.scio.values.WindowedValue), which gives full access to the windowing metadata: + +```scala mdoc:compile-only +import com.spotify.scio.values._ +import org.joda.time.Instant + +val elements: SCollection[String] = ??? +val windowed: WindowedSCollection[String] = elements.toWindowed +windowed.map { v: WindowedValue[String] => + v.withTimestamp(Instant.now()) +} +``` + +Scio provides convenience functions for the common types of windowing (@scaladoc[withFixedWindows](com.spotify.scio.values.SCollection#withFixedWindows(duration:org.joda.time.Duration,offset:org.joda.time.Duration,options:com.spotify.scio.values.WindowOptions):com.spotify.scio.values.SCollection[T]), @scaladoc[withSlidingWindows](com.spotify.scio.values.SCollection#withSlidingWindows(size:org.joda.time.Duration,period:org.joda.time.Duration,offset:org.joda.time.Duration,options:com.spotify.scio.values.WindowOptions):com.spotify.scio.values.SCollection[T]), @scaladoc[withSessionWindows](com.spotify.scio.values.SCollection#withSessionWindows(gapDuration:org.joda.time.Duration,options:com.spotify.scio.values.WindowOptions):com.spotify.scio.values.SCollection[T]), @scaladoc[withGlobalWindow](com.spotify.scio.values.SCollection#withGlobalWindow(options:com.spotify.scio.values.WindowOptions):com.spotify.scio.values.SCollection[T])) but also provides full control over the windowing with @scaladoc[withWindowFn](com.spotify.scio.values.SCollection#withWindowFn[W%3C:org.apache.beam.sdk.transforms.windowing.BoundedWindow](fn:org.apache.beam.sdk.transforms.windowing.WindowFn[_,W],options:com.spotify.scio.values.WindowOptions):com.spotify.scio.values.SCollection[T]). + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import org.joda.time.Duration + +val elements: SCollection[String] = ??? 
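+
+// assign each element to a fixed, non-overlapping one-hour window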
+val windowedElements: SCollection[String] = elements.withFixedWindows(Duration.standardHours(1))
+```
+
+## Batching
+
+In cases where some transform performs better on a group of items, elements can be batched by number of elements with @scaladoc[batch](com.spotify.scio.values.SCollection#batch(batchSize:Long,maxLiveWindows:Int):com.spotify.scio.values.SCollection[Iterable[T]]), by the size of the elements with @scaladoc[batchByteSized](com.spotify.scio.values.SCollection#batchByteSized(batchByteSize:Long,maxLiveWindows:Int):com.spotify.scio.values.SCollection[Iterable[T]]), or by some user-defined weight with @scaladoc[batchWeighted](com.spotify.scio.values.SCollection#batchWeighted(batchWeight:Long,cost:T=%3ELong,maxLiveWindows:Int):com.spotify.scio.values.SCollection[Iterable[T]]).
+There are also keyed variants of each of these: @scaladoc[batchByKey](com.spotify.scio.values.PairSCollectionFunctions#batchByKey(batchSize:Long,maxBufferingDuration:org.joda.time.Duration):com.spotify.scio.values.SCollection[(K,Iterable[V])]), @scaladoc[batchByteSizedByKey](com.spotify.scio.values.PairSCollectionFunctions#batchByteSizedByKey(batchByteSize:Long,maxBufferingDuration:org.joda.time.Duration):com.spotify.scio.values.SCollection[(K,Iterable[V])]), and @scaladoc[batchWeightedByKey](com.spotify.scio.values.PairSCollectionFunctions#batchWeightedByKey(weight:Long,cost:V=%3ELong,maxBufferingDuration:org.joda.time.Duration):com.spotify.scio.values.SCollection[(K,Iterable[V])]).
+
+```scala mdoc:compile-only
+import com.spotify.scio._
+import com.spotify.scio.values.SCollection
+
+val elements: SCollection[String] = ???
+val batchedElements: SCollection[Iterable[String]] = elements.batch(10)
+```
+
+## Misc
+
+Some elements of an `SCollection` can be randomly sampled using @scaladoc[sample](com.spotify.scio.values.SCollection#sample(withReplacement:Boolean,fraction:Double):com.spotify.scio.values.SCollection[T]):
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val elements: SCollection[String] = ???
+val result: SCollection[String] = elements.sample(withReplacement = true, fraction = 0.01)
+```
+
+The `SCollection` can be randomly split into new `SCollection`s given a weighting of what fraction of the input should be in each split:
+
+```scala
+import com.spotify.scio.values.SCollection
+
+val elements: SCollection[Int] = ???
+val weights: Array[Double] = Array(0.2, 0.6, 0.2)
+val splits: Array[SCollection[Int]] = elements.randomSplit(weights)
+```
+
+The "top" _n_ elements of an `SCollection` given some @scaladoc[Ordering](scala.math.Ordering) can be found with @scaladoc[top](com.spotify.scio.values.SCollection#top(num:Int)(implicitord:Ordering[T]):com.spotify.scio.values.SCollection[Iterable[T]]):
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val elements: SCollection[Int] = ???
+val top10: SCollection[Iterable[Int]] = elements.top(10)
+```
+
+The common elements of two `SCollection`s can be found with @scaladoc[intersection](com.spotify.scio.values.SCollection#intersection(that:com.spotify.scio.values.SCollection[T]):com.spotify.scio.values.SCollection[T]):
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val a: SCollection[String] = ???
+val b: SCollection[String] = ???
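+
+// keep only the elements that appear in both a and b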
+val common: SCollection[String] = a.intersection(b) +``` + +For a keyed `SCollection`, @scaladoc[intersectByKey](com.spotify.scio.values.PairSCollectionFunctions#intersectByKey(rhs:com.spotify.scio.values.SCollection[K]):com.spotify.scio.values.SCollection[(K,V)]) will give the elements in the LHS whose keys are in the RHS: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[String] = ??? +val common: SCollection[(String, Int)] = a.intersectByKey(b) +``` + +Similarly, @scaladoc[subtract](com.spotify.scio.values.SCollection#subtract(that:com.spotify.scio.values.SCollection[T]):com.spotify.scio.values.SCollection[T]) (or @scaladoc[subtractByKey](com.spotify.scio.values.PairSCollectionFunctions#subtractByKey(rhs:com.spotify.scio.values.SCollection[K]):com.spotify.scio.values.SCollection[(K,V)])) will give the elements in the LHS that are not present in the RHS: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val a: SCollection[String] = ??? +val b: SCollection[String] = ??? +val notInB: SCollection[String] = a.subtract(b) +``` diff --git a/site/src/main/paradox/FAQ.md b/site/src/main/paradox/FAQ.md index 1c035ef254..a434205d70 100644 --- a/site/src/main/paradox/FAQ.md +++ b/site/src/main/paradox/FAQ.md @@ -6,19 +6,17 @@ #### What's the status of Scio? -Scio is widely being used for production data pipelines at Spotify and is our preferred framework for building new pipelines on Google Cloud. We run Scio on [Google Cloud Dataflow](https://cloud.google.com/dataflow/) service in both batch and streaming modes. However it's still under heavy development and there might be minor breaking API changes from time to time. +Scio is widely being used for production data pipelines at Spotify and is our preferred framework for building new pipelines on Google Cloud. We run Scio on [Google Cloud Dataflow](https://cloud.google.com/dataflow/) service in both batch and streaming modes. It is still under development and there may be minor breaking API changes. #### Who's using Scio? Spotify uses Scio for all new data pipelines running on Google Cloud Platform, including music recommendation, monetization, artist insights and business analysis. We also use BigQuery, Bigtable and Datastore heavily with Scio. We use Scio in both batch and streaming mode. -As of mid 2017, there're 200+ developers and 700+ production pipelines. The largest batch job we've seen uses 800 n1-highmem-32 workers (25600 CPUs, 166.4TB RAM) and processes 325 billion rows from Bigtable (240TB). We also have numerous jobs that process 10TB+ of BigQuery data daily. On the streaming front, we have many jobs with 30+ n1-standard-16 workers (480 CPUs, 1.8TB RAM) and SSD disks for real time machine learning or reporting. - -For a incomplete list of users, see the [[Powered By]] page. +As of mid 2017, there are 200+ developers and 700+ production pipelines. The largest batch job we've seen uses 800 n1-highmem-32 workers (25600 CPUs, 166.4TB RAM) and processes 325 billion rows from Bigtable (240TB). We also have numerous jobs that process 10TB+ of BigQuery data daily. On the streaming front, we have many jobs with 30+ n1-standard-16 workers (480 CPUs, 1.8TB RAM) and SSD disks for real time machine learning or reporting. #### What's the relationship between Scio and Apache Beam? -Scio is a Scala API built on top of [Apache Beam](https://beam.apache.org/)'s Java SDK. 
Scio aims to offer a concise, idiomatic Scala API for a subset of Beam's features, plus extras we find useful, like REPL, type safe BigQuery, and IO taps. +Scio is a Scala API built on top of [Apache Beam](https://beam.apache.org/)'s Java SDK. Scio offers a concise, idiomatic Scala API for a subset of Beam's features, plus extras we find useful, like REPL, type safe BigQuery, and IO taps. #### What's the relationship between Scio and Google Cloud Dataflow? @@ -66,7 +64,7 @@ resolvers ++= Seq( #### How do I unit test pipelines? -Any Scala or Java unit testing frameworks can be used with Scio but we provide some utilities for [ScalaTest](http://www.scalatest.org/). +Any Scala or Java unit testing frameworks can be used with Scio, but we provide some utilities for [ScalaTest](http://www.scalatest.org/). - @scaladoc[PipelineTestUtils](com.spotify.scio.testing.PipelineTestUtils) - utilities for testing parts of a pipeline - @scaladoc[JobTest](com.spotify.scio.testing.JobTest$) - for testing pipelines end-to-end with complete arguments and IO coverage @@ -131,11 +129,11 @@ object MyJob { Scio exposes a few things to allow easy integration with native Beam Java API, notably: -- `ScioContext#customInput` to apply a `PTransform[_ >: PBegin, PCollection[T]]` (source) and get a `SCollection[T]`. -- `SCollection#applyTransform` to apply a `PTransform[_ >: PCollection[T], PCollection[U]]` and get a `SCollection[U]` +- `ScioContext#customInput` to apply a `PTransform[_ >: PBegin, PCollection[T]]` (source) and get an `SCollection[T]`. +- `SCollection#applyTransform` to apply a `PTransform[_ >: PCollection[T], PCollection[U]]` and get an `SCollection[U]` - `SCollection#saveAsCustomOutput` to apply a `PTransform[_ >: PCollection[T], PDone]` (sink) and get a `ClosedTap[T]`. -See @extref[BeamExample.scala](example:BeamExample) for more details. Custom I/O can also be tested via the @scaladoc[`JobTest`](com.spotify.scio.testing.JobTest$) harness. +See @extref[BeamExample](example:BeamExample) for more details. Custom I/O can also be tested via the @scaladoc[JobTest](com.spotify.scio.testing.JobTest$) harness. #### What are the different types of joins and performance implication? @@ -144,7 +142,7 @@ See @extref[BeamExample.scala](example:BeamExample) for more details. Custom I/O - Consider `skewedJoin` if some keys on the LHS are extremely hot. - Consider `sparseOuterJoin` if you want a full outer join where RHS is much smaller than LHS, but may not fit in memory. - Consider `cogroup` if you need to access value groups of each key. -- [`MultiJoin`](https://spotify.github.io/scio/api/com/spotify/scio/util/MultiJoin$.html) supports inner, left, outer join and cogroup of up to 22 inputs. +- @scaladoc[MultiJoin](com.spotify.scio.util.MultiJoin$) supports inner, left, outer join and cogroup of up to 22 inputs. - For multi-joins larger inputs should be on the left, e.g. `size(a) >= size(b) >= size(c) >= size(d)` in `MultiJoin(a, b, c, d)`. - Check out these [slides](http://www.lyh.me/slides/joins.html) for more information on joins. - Also see this section on [Cloud Dataflow Shuffle](https://cloud.google.com/dataflow/service/dataflow-service-desc#cloud-dataflow-shuffle) service. @@ -248,7 +246,7 @@ BigQuery doesn't provide a way to unit test query logic locally, but we can quer #### How do I stream to a partitioned BigQuery table? 
-Currently there is no way to create a [partitioned](https://cloud.google.com/bigquery/docs/partitioned-tables) BigQuery table via Scio/Beam when streaming, however it is possible to stream to a partitioned table if it is already created. +Currently, there is no way to create a [partitioned](https://cloud.google.com/bigquery/docs/partitioned-tables) BigQuery table via Scio/Beam when streaming, however it is possible to stream to a partitioned table if it is already created. This can be done by using fixed windows and using the window bounds to infer date. As of Scio 0.4.0-beta2 this looks as follows: @@ -303,9 +301,9 @@ In Scio 0.3.X it is possible to achieve the same behaviour using `SerializableFu Scio's @scaladoc[BigQuery client](com.spotify.scio.bigquery.client.BigQuery) in Scio caches query result in system property `bigquery.cache.directory`, which defaults to `$PWD/.bigquery`. Use `rm -rf .bigquery` to invalidate all cached results. To disable caching, set system property `bigquery.cache.enabled` to `false`. -#### How does BigQuery determines job priority? +#### How does BigQuery determine job priority? -By default Scio runs BigQuery jobs with `BATCH` priority except when in the REPL where it runs with `INTERACTIVE`. To override this, set system property `bigquery.priority` to either `BATCH` or `INTERACTIVE`. +By default, Scio runs BigQuery jobs with `BATCH` priority except when in the REPL where it runs with `INTERACTIVE`. To override this, set system property `bigquery.priority` to either `BATCH` or `INTERACTIVE`. ### Streaming questions @@ -348,7 +346,7 @@ def readme = FileSystems.open(readmeResource) This part is GCS specific. @@@ -You can get a @javadoc[`GcsUtil`](org.apache.beam.sdk.extensions.gcp.options.GcsOptions#getGcsUtil--) instance from `ScioContext`, which can be used to open GCS files in read or write mode. +You can get a @javadoc[GcsUtil](org.apache.beam.sdk.extensions.gcp.options.GcsOptions#getGcsUtil--) instance from `ScioContext`, which can be used to open GCS files in read or write mode. ```scala mdoc:reset:silent import com.spotify.scio.ContextAndArgs @@ -367,7 +365,7 @@ Datastore `Entity` class is actually generated from @github[Protobuf](/scio-exam #### How do I throttle Bigtable writes? -Currently Dataflow autoscaling may not work well with large writes BigtableIO. Specifically It does not take into account Bigtable IO rate limits and may scale up more workers and end up hitting the limit and eventually fail the job. As a workaround, you can enable throttling for Bigtable writes in Scio 0.4.0-alpha2 or later. +Currently, Dataflow autoscaling may not work well with large writes BigtableIO. Specifically It does not take into account Bigtable IO rate limits and may scale up more workers and end up hitting the limit and eventually fail the job. As a workaround, you can enable throttling for Bigtable writes in Scio 0.4.0-alpha2 or later. ```scala mdoc:reset:invisible val btProjectId = "" @@ -533,7 +531,7 @@ def main(cmdlineArgs: Array[String]): Unit = { #### What does "Cannot prove that T1 <:< T2" mean? -Sometimes you get an error message like `Cannot prove that T1 <:< T2` when saving an `SCollection`. This is because some sink methods have an implicit argument like this which means element type `T` of `SCollection[T]` must be a sub-type of `TableRow` in order to save it to BigQuery. You have to map out elements to the required type before saving. +Sometimes you get an error message like `Cannot prove that T1 <:< T2` when saving an `SCollection`. 
This is because some sink methods have an implicit argument like this which means element type `T` of `SCollection[T]` must be a subtype of `TableRow` in order to save it to BigQuery. You have to map out elements to the required type before saving. ```scala def saveAsBigQuery(tableSpec: String)(implicit ev: T <:< TableRow) @@ -617,7 +615,7 @@ There is multiple options here: #### How do I improve side input performance? -By default Dataflow workers allocate 100MB (see @javadoc[DataflowWorkerHarnessOptions#getWorkerCacheMb](org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions#getWorkerCacheMb--)) of memory for caching side inputs, and falls back to disk or network. Therefore jobs with large side inputs may be slow. To override this default, register `DataflowWorkerHarnessOptions` before parsing command line arguments and then pass `--workerCacheMb=N` when submitting the job. +By default, Dataflow workers allocate 100MB (see @javadoc[DataflowWorkerHarnessOptions#getWorkerCacheMb](org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions#getWorkerCacheMb--)) of memory for caching side inputs, and falls back to disk or network. Therefore jobs with large side inputs may be slow. To override this default, register `DataflowWorkerHarnessOptions` before parsing command line arguments and then pass `--workerCacheMb=N` when submitting the job. ```scala mdoc:reset:silent import com.spotify.scio._ @@ -633,7 +631,7 @@ def main(cmdlineArgs: Array[String]): Unit = { #### How do I control concurrency (number of DoFn threads) in Dataflow workers -By default Google Cloud Dataflow will use as many threads (concurrent DoFns) per worker as appropriate (precise definition is an implementation detail), in some cases you might want to control this. Use `NumberOfWorkerHarnessThreads` option from `DataflowPipelineDebugOptions`. For example to use a single thread per worker on 8 vCPU machine, simply specify 8 vCPU worker machine type, and `--numberOfWorkerHarnessThreads=1` in CLI or set corresponding option in `DataflowPipelineDebugOptions`. +By default, Google Cloud Dataflow will use as many threads (concurrent DoFns) per worker as appropriate (precise definition is an implementation detail), in some cases you might want to control this. Use `NumberOfWorkerHarnessThreads` option from `DataflowPipelineDebugOptions`. For example to use a single thread per worker on 8 vCPU machine, simply specify 8 vCPU worker machine type, and `--numberOfWorkerHarnessThreads=1` in CLI or set corresponding option in `DataflowPipelineDebugOptions`. #### How to manually investigate a Cloud Dataflow worker diff --git a/site/src/main/paradox/Getting-Started.md b/site/src/main/paradox/Getting-Started.md index 31b10e3554..f986eaffa6 100644 --- a/site/src/main/paradox/Getting-Started.md +++ b/site/src/main/paradox/Getting-Started.md @@ -34,7 +34,7 @@ sbt compile test:compile ## Running the Examples -You can execute the examples locally from SBT. By default pipelines will be executed using the @javadoc[`DirectRunner`](org.apache.beam.runners.direct.DirectRunner) and local filesystem will be used for input and output. Take a look at the @github[examples](/scio-examples/src/main/scala/com/spotify/scio/examples) to find out more. +You can execute the examples locally from SBT. By default pipelines will be executed using the @javadoc[DirectRunner](org.apache.beam.runners.direct.DirectRunner) and local filesystem will be used for input and output. 
Take a look at the @github[examples](/scio-examples/src/main/scala/com/spotify/scio/examples) to find out more. ``` neville@localhost scio $ sbt @@ -48,7 +48,7 @@ neville@localhost scio $ sbt Unlike Hadoop, Scio or Dataflow input should be file patterns and not directories, i.e. `gs://bucket/path/part-*.txt` and not `gs://bucket/path`. Output on the other hand should be directories just like Hadoop, so `gs://bucket/path` will produce files like `gs://bucket/path/part-00000-of-00005.txt`. @@@ -Use the @javadoc[`DataflowRunner`](org.apache.beam.runners.dataflow.DataflowRunner) to execute pipelines on Google Cloud Dataflow service using managed resources in the Google Cloud Platform. +Use the @javadoc[DataflowRunner](org.apache.beam.runners.dataflow.DataflowRunner) to execute pipelines on Google Cloud Dataflow service using managed resources in the Google Cloud Platform. ``` neville@localhost scio $ sbt @@ -118,9 +118,9 @@ The defaults should work well for most cases but we sometimes tune the following - `--workerDiskType` - specify SSD for jobs with really expensive shuffles. See a list of disk types [here](https://cloud.google.com/compute/docs/reference/latest/diskTypes). Also see this [page](https://cloud.google.com/compute/docs/disks/performance) about persistent disk size and type. - `--network` - specify this if you use VPN to communicate with external services, e.g. HDFS on an on-premise cluster. -More Dataflow pipeline specific options available can be found in @javadoc[`DataflowPipelineOptions`](org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) and super interfaces. Some more useful ones are from @javadoc[`DataflowPipelineWorkerPoolOptions`](org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions). +More Dataflow pipeline specific options available can be found in @javadoc[DataflowPipelineOptions](org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) and super interfaces. Some more useful ones are from @javadoc[DataflowPipelineWorkerPoolOptions](org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions). -@javadoc[`DataflowWorkerHarnessOptions#getWorkerCacheMb`](org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions#getWorkerCacheMb--) affects side input performance but needs an extra step to enable. See this @ref:[FAQ item](FAQ.md#how-do-i-improve-side-input-performance-). +@javadoc[DataflowWorkerHarnessOptions#getWorkerCacheMb](org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions#getWorkerCacheMb--) affects side input performance but needs an extra step to enable. See this @ref:[FAQ item](FAQ.md#how-do-i-improve-side-input-performance-). There are a few more experimental settings that might help specific scenarios: - `--experiments=shuffle_mode=service` - use external [shuffle service](https://cloud.google.com/dataflow/service/dataflow-service-desc#cloud-dataflow-shuffle) instead of local disk diff --git a/site/src/main/paradox/Joins.md b/site/src/main/paradox/Joins.md new file mode 100644 index 0000000000..9a29b080f7 --- /dev/null +++ b/site/src/main/paradox/Joins.md @@ -0,0 +1,214 @@ +# Joins + +Scio provides a full suite of join functionality and a few extras that solve tricky edge-cases in large-scale data processing. + +All joins operate over `SCollection`s containing 2-tuples, where the first tuple item is considered the _key_ and the second the _value_. +The order in which `SCollection`s are joined matters; larger datasets should be further to the left. 
+For example, in `a.join(b)`, `a` should be the larger of the two datasets and by convention `a` is called the left-hand-side or _LHS_, while `b` is the right-hand-side or _RHS_.
+
+## Cogroup
+
+The Beam transform which underlies the standard joins below is [CoGroupByKey](https://beam.apache.org/documentation/programming-guide/#cogroupbykey).
+Scio also provides a @scaladoc[cogroup](com.spotify.scio.values.PairSCollectionFunctions#cogroup[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(Iterable[V],Iterable[W]))]) operation, which returns iterables from each `SCollection` containing all the values which match each key:
+
+```scala
+import com.spotify.scio.values.SCollection
+
+val a: SCollection[(String, String)] = ???
+val b: SCollection[(String, Int)] = ???
+val elements: SCollection[(String, (Iterable[String], Iterable[Int]))] = a.cogroup(b)
+```
+
+## Standard joins
+
+Scio's standard joins have SQL-like names with SQL-like semantics.
+In the examples below, the contents of the LHS are of type `(K, V)`, while the RHS are of type `(K, W)`.
+
+@scaladoc[join](com.spotify.scio.values.PairSCollectionFunctions#join[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(V,W))]) produces elements of `(K, (V, W))`, where the key `K` must be in both the LHS and RHS:
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val a: SCollection[(String, String)] = ???
+val b: SCollection[(String, Int)] = ???
+val elements: SCollection[(String, (String, Int))] = a.join(b)
+```
+
+@scaladoc[leftOuterJoin](com.spotify.scio.values.PairSCollectionFunctions#leftOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(V,Option[W]))]) produces elements of `(K, (V, Option[W]))`, where the key `K` is in the LHS but may not be in the RHS:
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val a: SCollection[(String, String)] = ???
+val b: SCollection[(String, Int)] = ???
+val elements: SCollection[(String, (String, Option[Int]))] = a.leftOuterJoin(b)
+```
+
+@scaladoc[rightOuterJoin](com.spotify.scio.values.PairSCollectionFunctions#rightOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(Option[V],W))]) produces elements of `(K, (Option[V], W))`, where the key `K` is in the RHS but may not be in the LHS:
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val a: SCollection[(String, String)] = ???
+val b: SCollection[(String, Int)] = ???
+val elements: SCollection[(String, (Option[String], Int))] = a.rightOuterJoin(b)
+```
+
+@scaladoc[fullOuterJoin](com.spotify.scio.values.PairSCollectionFunctions#fullOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(Option[V],Option[W]))]) produces elements of `(K, (Option[V], Option[W]))`, where the key `K` can be in either side:
+
+```scala mdoc:compile-only
+import com.spotify.scio.values.SCollection
+
+val a: SCollection[(String, String)] = ???
+val b: SCollection[(String, Int)] = ???
+val elements: SCollection[(String, (Option[String], Option[Int]))] = a.fullOuterJoin(b)
+```
+
+When multiple joins of the same type are chained, it is more efficient to use Scio's @scaladoc[MultiJoin](com.spotify.scio.util.MultiJoin) class.
Instead of `a.join(b).join(c)` prefer `MultiJoin` (or its variants, `MultiJoin.left`, `MultiJoin.outer`, `MultiJoin.cogroup`): + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.util.MultiJoin + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[(String, Boolean)] = ??? +val c: SCollection[(String, Float)] = ??? +val elements: SCollection[(String, (Int, Boolean, Float))] = MultiJoin(a, b, c) +``` + +## Hash joins + +Scio's @scaladoc[hashJoin](com.spotify.scio.values.PairHashSCollectionFunctions#hashJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(V,W))]) and variants @scaladoc[hashLeftOuterJoin](com.spotify.scio.values.PairHashSCollectionFunctions#hashLeftOuterJoin[W](sideInput:com.spotify.scio.values.SideInput[Map[K,Iterable[W]]]):com.spotify.scio.values.SCollection[(K,(V,Option[W]))]), and @scaladoc[hashFullOuterJoin](com.spotify.scio.values.PairHashSCollectionFunctions#hashFullOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)]):com.spotify.scio.values.SCollection[(K,(Option[V],Option[W]))]) provide a convenient syntax over the top of Beam's SideInput class to avoid shuffle during the join. +The RHS should fit in memory, as with normal @ref[SideInputs](SideInputs.md). + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[(String, Boolean)] = ??? +val elements: SCollection[(String, (Int, Boolean))] = a.hashJoin(b) +``` + +In the less-common case where the LHS contains only keys to be looked-up, @scaladoc[hashLookup](com.spotify.scio.values.SCollection#hashLookup[V](that:com.spotify.scio.values.SCollection[(T,V)]):com.spotify.scio.values.SCollection[(T,Iterable[V])]) will join in all matching values from the RHS. +Again, the RHS should fit in memory. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val a: SCollection[String] = ??? +val b: SCollection[(String, String)] = ??? +val elements: SCollection[(String, Iterable[String])] = a.hashLookup(b) +``` + +In addition, Scio also provides the shuffle-free intersection and subtraction operations @scaladoc[hashIntersectByKey](com.spotify.scio.values.PairHashSCollectionFunctions#hashIntersectByKey(rhs:com.spotify.scio.values.SCollection[K]):com.spotify.scio.values.SCollection[(K,V)]) and @scaladoc[hashSubtractByKey](com.spotify.scio.values.PairHashSCollectionFunctions#hashSubtractByKey(sideInput:com.spotify.scio.values.SideInput[Set[K]]):com.spotify.scio.values.SCollection[(K,V)]). + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[String] = ??? 
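+
+// b's keys are distributed as a side input, so neither operation shuffles a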
+ +val subtracted: SCollection[(String, Int)] = a.hashSubtractByKey(b) +val intersected: SCollection[(String, Int)] = a.hashIntersectByKey(b) +``` + +## Large hash join + +Similar to Hash Joins, Scio's @scaladoc[largeHashJoin](com.spotify.scio.extra.sparkey.PairLargeHashSCollectionFunctions#largeHashJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],numShards:Short,compressionType:com.spotify.sparkey.CompressionType,compressionBlockSize:Int):com.spotify.scio.values.SCollection[(K,(V,W))]) and variants @scaladoc[largeHashLeftOuterJoin](com.spotify.scio.extra.sparkey.PairLargeHashSCollectionFunctions#largeHashLeftOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],numShards:Short,compressionType:com.spotify.sparkey.CompressionType,compressionBlockSize:Int):com.spotify.scio.values.SCollection[(K,(V,Option[W]))]) and @scaladoc[largeHashFullOuterJoin](com.spotify.scio.extra.sparkey.PairLargeHashSCollectionFunctions#largeHashFullOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],numShards:Short,compressionType:com.spotify.sparkey.CompressionType,compressionBlockSize:Int):com.spotify.scio.values.SCollection[(K,(Option[V],Option[W]))]) provide a convenient syntax on top of Scio's @ref[Sparkey](extras/Sparkey.md) support to avoid shuffle during a join. +Use of sparkey requires only that the RHS fit on disk. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.sparkey._ + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[(String, Boolean)] = ??? +val elements: SCollection[(String, (Int, Boolean))] = a.largeHashJoin(b) +``` + +Larger shuffle-free intersection and subtraction operations are also provided as @scaladoc[largeHashIntersectByKey](com.spotify.scio.extra.sparkey.PairLargeHashSCollectionFunctions#largeHashIntersectByKey(rhs:com.spotify.scio.values.SCollection[K],numShards:Short,compressionType:com.spotify.sparkey.CompressionType,compressionBlockSize:Int):com.spotify.scio.values.SCollection[(K,V)]) and @scaladoc[largeHashSubtractByKey](com.spotify.scio.extra.sparkey.PairLargeHashSCollectionFunctions#largeHashSubtractByKey(rhs:com.spotify.scio.values.SCollection[K],numShards:Short,compressionType:com.spotify.sparkey.CompressionType,compressionBlockSize:Int):com.spotify.scio.values.SCollection[(K,V)]). + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.sparkey._ + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[String] = ??? + +val subtracted: SCollection[(String, Int)] = a.largeHashSubtractByKey(b) +val intersected: SCollection[(String, Int)] = a.largeHashIntersectByKey(b) +``` + +## Sparse join + +Scio supports a 'sparse join' for cases where both the LHS and RHS of a join are large, but where the keys in the RHS cover a relatively small number of rows in the LHS. + +In this case, an optimization of the join can significantly reduce the shuffle. +The keys of the RHS are inserted into a [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter), a probabilistic data structure that effectively acts as a `Set` but with some risk of false positives. +Elements in the LHS dataset are partitioned by passing an element's key through the filter and splitting the dataset on whether the key is found or not. +All LHS keys which are found in the filter are _probably_ in the RHS dataset, so a full join is performed on these elements. +Any LHS key _not found_ in the filter are _definitely not_ in the RHS dataset, so these items can be handled without a join. 
+To properly size the Bloom filter, an estimate of the number of keys in the RHS (`rhsNumKeys`) must be provided to the join function. + +In addition to @scaladoc[sparseJoin](com.spotify.scio.values.PairSCollectionFunctions#sparseJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],rhsNumKeys:Long,fpProb:Double)(implicitfunnel:com.google.common.hash.Funnel[K]):com.spotify.scio.values.SCollection[(K,(V,W))]) (and variants @scaladoc[sparseLeftOuterJoin](com.spotify.scio.values.PairSCollectionFunctions#sparseLeftOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],rhsNumKeys:Long,fpProb:Double)(implicitfunnel:com.google.common.hash.Funnel[K]):com.spotify.scio.values.SCollection[(K,(V,Option[W]))]), @scaladoc[sparseRightOuterJoin](com.spotify.scio.values.PairSCollectionFunctions#sparseRightOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],rhsNumKeys:Long,fpProb:Double)(implicitfunnel:com.google.common.hash.Funnel[K]):com.spotify.scio.values.SCollection[(K,(Option[V],W))]), and @scaladoc[sparseFullOuterJoin](com.spotify.scio.values.PairSCollectionFunctions#sparseFullOuterJoin[W](rhs:com.spotify.scio.values.SCollection[(K,W)],rhsNumKeys:Long,fpProb:Double)(implicitfunnel:com.google.common.hash.Funnel[K]):com.spotify.scio.values.SCollection[(K,(Option[V],Option[W]))])) Scio also provides a @scaladoc[sparseIntersectByKey](com.spotify.scio.values.PairSCollectionFunctions#sparseIntersectByKey(rhs:com.spotify.scio.values.SCollection[K],rhsNumKeys:Long,computeExact:Boolean,fpProb:Double)(implicitfunnel:com.google.common.hash.Funnel[K]):com.spotify.scio.values.SCollection[(K,V)]) implementation. +Scio uses Guava's @javadoc[BloomFilter](com.google.common.hash.BloomFilter). +Import `magnolify.guava.auto._` to get common instances of Guava @javadoc[Funnel](com.google.common.hash.Funnel): + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import magnolify.guava.auto._ + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[(String, Boolean)] = ??? +val c: SCollection[String] = ??? + +val bNumKeys: Int = ??? +val joined = a.sparseJoin(b, bNumKeys) + +val cNumKeys: Int = ??? +val intersected: SCollection[(String, Int)] = a.sparseIntersectByKey(c, cNumKeys) +``` + +Finally, Scio provides @scaladoc[sparseLookup](com.spotify.scio.values.PairSCollectionFunctions#sparseLookup[A](rhs:com.spotify.scio.values.SCollection[(K,A)],thisNumKeys:Long)(implicitfunnel:com.google.common.hash.Funnel[K]):com.spotify.scio.values.SCollection[(K,(V,Iterable[A]))]), a special-case for joining all items from the RHS with matching keys into the LHS items with that key. +Differently than the other `sparse` variants, in this case an estimate of the number of keys in the LHS must be provided: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import magnolify.guava.auto._ + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[(String, String)] = ??? + +val aNumKeys: Int = ??? +val lookedUp: SCollection[(String, (Int, Iterable[String]))] = a.sparseLookup(b, aNumKeys) +``` + +## Skewed Join + +Similar to sparse joins, Scio supports a 'skewed join' for the special case in which some keys in a dataset are very frequent, or _hot_. + +Scio uses a [Count-Min sketch](https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch) (or _CMS_), a probabilistic data structure that is internally similar to a Bloom filter, but which provides an estimated count for a given item which is explicitly an _over_estimate. 
+The keys of the LHS are counted and those which exceed the value of the `hotKeyThreshold` parameter (default: `9000`) plus an error bound are considered 'hot', while any remaining key is 'cold'. +Both the LHS and RHS are divided into 'hot' and 'chill' partitions. +The chill sides are joined normally, while the hot side of the RHS is `hashJoin`ed into the hot LHS, avoiding shuffle on this segment of the dataset. + +Scio provides @scaladoc[skewedJoin](com.spotify.scio.values.PairSkewedSCollectionFunctions), @scaladoc[skewedLeftOuterJoin](com.spotify.scio.values.PairSkewedSCollectionFunctions), and @scaladoc[skewedFullOuterJoin](com.spotify.scio.values.PairSkewedSCollectionFunctions) variants. +Import `com.twitter.algebird.CMSHasherImplicits._` for the implicits required for count-min sketch. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.twitter.algebird.CMSHasherImplicits._ + +val a: SCollection[(String, Int)] = ??? +val b: SCollection[(String, String)] = ??? +val elements: SCollection[(String, (Int, String))] = a.skewedJoin(b) +``` + +## Sort Merge Bucket (SMB) + +Sort-Merge Buckets allow for shuffle-free joins of large datasets. +See @ref[Sort Merge Bucket](extras/Sort-Merge-Bucket.md) + +## See also + +* [Join Optimizations at Spotify](https://youtu.be/cGvaQp_h5ek?t=6257) How Scio can save you time and money with clever join strategies and approximate algorithms, Apache Beam Summit 2022 diff --git a/site/src/main/paradox/Powered-By.md b/site/src/main/paradox/Powered-By.md deleted file mode 100644 index 8ddaff274c..0000000000 --- a/site/src/main/paradox/Powered-By.md +++ /dev/null @@ -1,22 +0,0 @@ -# Powered By - -Here is a list of organizations using Scio in production. - -| **Organization** | **Use Case** | **Code** | -|:------------|:----------------------|:---------| -| [Spotify](https://www.spotify.com/) | Everything including music recommendation, monetization, artist insights and business analysis. We also use BigQuery, Bigtable and Datastore heavily with Scio. | [Big Data Rosetta Code](https://github.com/spotify/big-data-rosetta-code), [Ratatool](https://github.com/spotify/ratatool), [Featran](https://github.com/spotify/featran) | -| [Big Viking Games](https://www.bigvikinggames.com/) | Streaming event collection and ETL using Pub/Sub and BigQuery | | -| [Algolia](https://www.algolia.com/) | Log collection and analytics using Bigtable, Cloud Storage & Pub/sub | | -| [Hypefactors](https://www.hypefactors.com/) | Natural language processing / media monitoring. Also using PubSub, GCS and ElasticSearch with Scio. | | -| [Discord](https://discordapp.com/) | Streaming event collection, sessionization, and enrichment using Pub/Sub, BigQuery, and Bigtable. | | -| [Dow Jones](https://www.dowjones.com/)| Streaming article events, bulk article extractions and ETL using Pub/Sub, GCS and BigQuery for the [DNA Platform](https://www.dowjones.com/dna).| | -| [Honey](https://www.joinhoney.com) | Streaming ETL data pipeline from Pub/Sub to Pub/Sub, BigTable, BigQuery. 
| | -| [Cabify](https://www.cabify.com) | Streaming data pipelines from Pub/Sub to BigQuery | | -| [Jobrapido](https://www.jobrapido.com) | Streaming and Batch ETL using Pub/Sub and BigQuery | | -| [9GAG](https://9gag.com) | Streaming and Batch ETL using Pub/Sub | | -| [Cityblock](https://www.cityblock.com) | Streaming and Batch ETL using Pub/Sub, BigQuery, and Datastore | | -| [Arquivei](https://www.arquivei.com.br) | Streaming and Batch ETL using Pub/Sub, BigQuery, GCS, S3, and ElasticSearch | | -| [Vpon](http://www.vpon.com/en/) | Batch ETL and BigQuery | | -| [Snowplow Analytics](https://snowplowanalytics.com/) | Streaming ETL | [Beam Enrich](https://github.com/snowplow/snowplow/tree/master/3-enrich/beam-enrich), [BigQuery Loader](https://github.com/snowplow-incubator/snowplow-bigquery-loader) and [Cloud Storage Loader](https://github.com/snowplow-incubator/snowplow-google-cloud-storage-loader) | -| [Quibi](https://quibi.com/) | Streaming and Batch ETL from Pub/Sub and GCS to BigQuery | | -| [Ztore](https://www.ztore.com/) | Streaming ETL and event-driven application, using Pub/Sub, BigQuery, Kafka, MongoDB, AWS SQS, and DynamoDB | | diff --git a/site/src/main/paradox/Scio,-Beam-and-Dataflow.md b/site/src/main/paradox/Scio,-Beam-and-Dataflow.md deleted file mode 100644 index 83a117d1a5..0000000000 --- a/site/src/main/paradox/Scio,-Beam-and-Dataflow.md +++ /dev/null @@ -1,45 +0,0 @@ -# Scio, Beam and Dataflow - -Check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. Also see this comparison between [[Scio, Scalding and Spark]] APIs. - -Scio aims to be a thin wrapper on top of Beam while offering idiomatic Scala style API. - -## Basics - -- @scaladoc[`ScioContext`](com.spotify.scio.ScioContext) wraps @javadoc[`Pipeline`](org.apache.beam.sdk.Pipeline) -- @scaladoc[`SCollection`](com.spotify.scio.values.SCollection) wraps @javadoc[`PCollection`](org.apache.beam.sdk.values.PCollection) -- @scaladoc[`ScioResult`](com.spotify.scio.ScioResult) wraps @javadoc[`PipelineResult`](org.apache.beam.sdk.PipelineResult) -- Most @javadoc[`PTransform`](org.apache.beam.sdk.transforms.PTransform) are implemented as idiomatic Scala methods on `SCollection` e.g. `map`, `flatMap`, `filter`, `reduce`. -- @scaladoc[`PairSCollectionFunctions`](com.spotify.scio.values.PairSCollectionFunctions) and @scaladoc[`DoubleSCollectionFunctions`](com.spotify.scio.values.DoubleSCollectionFunctions) are specialized version of `SCollection` implemented via the Scala "[pimp my library](https://coderwall.com/p/k_1jzw/scala-s-pimp-my-library-pattern-example)" pattern. -- An `SCollection[(K, V)]` is automatically converted to a `PairSCollectionFunctions` which provides key-value operations, e.g. `groupByKey`, `reduceByKey`, `cogroup`, `join`. -- An `SCollection[Double]` is automatically converted to a `DoubleSCollectionFunctions` which provides statistical operations, e.g. `stddev`, `variance`. - -## ScioContext, PipelineOptions, Args and ScioResult - -- Beam/Dataflow uses @javadoc[`PipelineOptions`](org.apache.beam.sdk.options.PipelineOptions) and its subclasses to parse command line arguments. Users have to extend the interface for their application level arguments. -- Scalding uses [`Args`](https://twitter.github.io/scalding/api/#com.twitter.scalding.Args) to parse application arguments in a more generic and boilerplate free style. 
-- `ScioContext` has a `parseArguments` method that takes an `Array[String]` of command line arguments, parses Beam/Dataflow specific ones into a `PipelineOptions`, and application specific ones into an `Args`, and returns the `(PipelineOptions, Args)`. -- `ContextAndArgs` is a short cut to create a `(ScioContext, Args)`. -- `ScioResult` can be used to access accumulator values and job state. - -## IO - -- Most @javadoc[`IO`](org.apache.beam.sdk.io.package-summary) Read transforms are implemented as methods on `ScioContext`, e.g. `avroFile`, `textFile`, `bigQueryTable`. -- Most `IO` Write transforms are implemented as methods on `SCollection`, e.g. `saveAsAvroFile`, `saveAsTextFile`, `saveAsBigQueryTable`. -- These IO operations also detects when the `ScioContext` is running in a @scaladoc[`JobTest`](com.spotify.scio.testing.JobTest$) and manages test IO in memory. -- Write options also return a @scaladoc[`ClosedTap`](com.spotify.scio.io.ClosedTap). Once the job completes you can open the @scaladoc[`Tap`](com.spotify.scio.io.Tap). `Tap` abstracts away the logic of reading the dataset directly as an `Iterator[T]` or re-opening it in another `ScioContext`. The `Future` is complete once the job finishes. This can be used to do light weight pipeline orchestration e.g. @extref[WordCountOrchestration.scala](example:WordCountOrchestration). - -## ByKey operations - -- Beam/Dataflow `ByKey` transforms require `PCollection[KV[K, V]]` inputs while Scio uses `SCollection[(K, V)]` -- Hence every `ByKey` transform in `PairSCollectionFunctions` converts Scala `(K, V)` to `KV[K, V]` before and vice versa afterwards. However these are lightweight wrappers and the JVM should be able to optimize them. -- `PairSCollectionFunctions` also converts `java.lang.Iterable[V]` and `java.util.List[V]` to `scala.Iterable[V]` in some cases. - -## Coders - -- Beam/Dataflow uses @javadoc[`Coder`](org.apache.beam.sdk.coders.Coder) for (de)serializing elements in a `PCollection` during shuffle. There are built-in coders for Java primitive types, collections, and common types in GCP like Avro, ProtoBuf, BigQuery `TableRow`, Datastore `Entity`. -- `PCollection` uses [`TypeToken`](https://google.github.io/guava/releases/snapshot/api/docs/com/google/common/reflect/TypeToken.html) from [Guava reflection](https://github.com/google/guava/wiki/ReflectionExplained) and @javodoc[`TypeDescriptor`](org.apache.beam.sdk.values.TypeDescriptor) to workaround Java type erasure and retrieve type information of elements. This may not always work but there is a `PCollection#setCoder` method to override. -- Twitter's [chill](https://github.com/twitter/chill) library uses [kryo](https://github.com/EsotericSoftware/kryo) to (de)serialize data. Chill includes serializers for common Scala types and cal also automatically derive serializers for arbitrary objects. Scio falls back to @github[`KryoAtomicCoder`](/scio-core/src/main/scala/com/spotify/scio/coders/KryoAtomicCoder.scala) when a built-in one isn't available. -- A coder may be non-deterministic if `Coder#verifyDeterministic` throws an exception. Any data type with such a coder cannot be used as a key in `ByKey` operations. However `KryoAtomicCoder` assumes all types are deterministic for simplicity so it's up to the user's discretion to not avoid non-deterministic types e.g. tuples or case classes with doubles as keys. 
-- Avro [`GenericRecord`](https://avro.apache.org/docs/current/api/java/org/apache/avro/generic/GenericRecord.html) requires a schema during deserialization (which is available as `GenericRecord#getSchema` for serialization) and @javadoc[`AvroCoder`](org.apache.beam.sdk.coders.AvroCoder) requires that too during initialization. This is not possible in `KryoAtomicCoder`, i.e. when nesting `GenericRecord` inside a Scala type. Instead `KryoAtomicCoder` serializes the schema before every record so that they can roundtrip safely. This is not optimal but the only way without requiring user to handcraft a custom coder. - diff --git a/site/src/main/paradox/Scio,-Scalding-and-Spark.md b/site/src/main/paradox/Scio,-Scalding-and-Spark.md index 5a3c8c6012..5d4353fc38 100644 --- a/site/src/main/paradox/Scio,-Scalding-and-Spark.md +++ b/site/src/main/paradox/Scio,-Scalding-and-Spark.md @@ -1,6 +1,6 @@ # Scio, Spark and Scalding -Check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. Also read more about the relationship between [[Scio, Beam and Dataflow]]. +Check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. Scio's API is heavily influenced by Spark with a lot of ideas from Scalding. @@ -10,11 +10,11 @@ The Dataflow programming model is fundamentally different from that of Spark. Re The Scio API is heavily influenced by Spark but there are some minor differences. -- @scaladoc[`SCollection`](com.spotify.scio.values.SCollection) is equivalent to Spark's [`RDD`](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.RDD). -- @scaladoc[`PairSCollectionFunctions`](com.spotify.scio.values.PairSCollectionFunctions) and @scaladoc[`DoubleSCollectionFunctions`](com.spotify.scio.values.DoubleSCollectionFunctions) are specialized versions of `SCollection` and equivalent to Spark's [`PairRDDFunctions`](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) and [`DoubleRDDFunctions`](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions). +- @scaladoc[SCollection](com.spotify.scio.values.SCollection) is equivalent to Spark's [`RDD`](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.RDD). +- @scaladoc[PairSCollectionFunctions](com.spotify.scio.values.PairSCollectionFunctions) and @scaladoc[DoubleSCollectionFunctions](com.spotify.scio.values.DoubleSCollectionFunctions) are specialized versions of `SCollection` and equivalent to Spark's [`PairRDDFunctions`](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) and [`DoubleRDDFunctions`](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions). - Execution planning is static and happens before the job is submitted. There is no driver node in a Dataflow cluster and one can only perform the equivalent of Spark [_transformations_](http://spark.apache.org/docs/latest/programming-guide.html#transformations) (`RDD` → `RDD`) but not [_actions_](http://spark.apache.org/docs/latest/programming-guide.html#actions) (`RDD` → driver local memory). 
- There is no [_broadcast_](http://spark.apache.org/docs/latest/programming-guide.html#broadcast-variables) either but the pattern of `RDD` → driver via _action_ and driver → `RDD` via _broadcast_ can be replaced with `SCollection.asSingletonSideInput` and `SCollection.withSideInputs`. -- There is no [`DStream`](https://spark.apache.org/docs/latest/streaming-programming-guide.html#discretized-streams-dstreams) (continuous series of `RDD`s) like in Spark Streaming. Values in a `SCollection` are windowed based on timestamp and windowing operation. The same API works regardless of batch (single global window by default) or streaming mode. Aggregation type _transformations_ that produce `SCollection`s of a single value under global window will produce one value each window when a non-global window is defined. +- There is no [`DStream`](https://spark.apache.org/docs/latest/streaming-programming-guide.html#discretized-streams-dstreams) (continuous series of `RDD`s) like in Spark Streaming. Values in an `SCollection` are windowed based on timestamp and windowing operation. The same API works regardless of batch (single global window by default) or streaming mode. Aggregation type _transformations_ that produce `SCollection`s of a single value under global window will produce one value each window when a non-global window is defined. - `SCollection` has extra methods for side input, side output, and windowing. ## Scio and Scalding @@ -29,24 +29,24 @@ Scio has a much simpler abstract data types compared to Scalding. Some features may look familiar to Scalding users. -- @scaladoc[`Args`](com.spotify.scio.Args) is a simple command line argument parser similar to the one in Scalding. +- @scaladoc[Args](com.spotify.scio.Args) is a simple command line argument parser similar to the one in Scalding. - Powerful transforms are possible with `sum`, `sumByKey`, `aggregate`, `aggregrateByKey` using [Algebird](https://github.com/twitter/algebird) `Semigroup`s and `Aggregator`s. -- @scaladoc[`MultiJoin`](com.spotify.scio.util.MultiJoin$) and coGroup of up to 22 sources. -- @scaladoc[`JobTest`](com.spotify.scio.testing.JobTest$) for end to end pipeline testing. +- @scaladoc[MultiJoin](com.spotify.scio.util.MultiJoin$) and coGroup of up to 22 sources. +- @scaladoc[JobTest](com.spotify.scio.testing.JobTest$) for end to end pipeline testing. ## SCollection `SCollection` has a few variations. -- @scaladoc[`SCollectionWithSideInput`](com.spotify.scio.values.SCollectionWithSideInput) for replicating small `SCollection`s to all left-hand side values in a large `SCollection`. -- @scaladoc[`SCollectionWithSideOutput`](com.spotify.scio.values.SCollectionWithSideOutput) for output to multiple SCollections. -- @scaladoc[`WindowedSCollection`](com.spotify.scio.values.WindowedSCollection) for accessing window information. -- @scaladoc[`SCollectionWithFanout`](com.spotify.scio.values.SCollectionWithFanout) and @scaladoc[`SCollectionWithHotKeyFanout`](com.spotify.scio.values.SCollectionWithHotKeyFanout) for fanout of skewed data. +- @scaladoc[SCollectionWithSideInput](com.spotify.scio.values.SCollectionWithSideInput) for replicating small `SCollection`s to all left-hand side values in a large `SCollection`. +- @scaladoc[SCollectionWithSideOutput](com.spotify.scio.values.SCollectionWithSideOutput) for output to multiple SCollections. +- @scaladoc[WindowedSCollection](com.spotify.scio.values.WindowedSCollection) for accessing window information. 
+- @scaladoc[SCollectionWithFanout](com.spotify.scio.values.SCollectionWithFanout) and @scaladoc[SCollectionWithHotKeyFanout](com.spotify.scio.values.SCollectionWithHotKeyFanout) for fanout of skewed data. ## Additional features Scio also offers some additional features. -- Each worker can pull files from Google Cloud Storage via @scaladoc[`DistCache`](com.spotify.scio.values.DistCache) to be used in transforms locally, similar to Hadoop distributed cache. See @extref[DistCacheExample.scala](example:DistCacheExample). +- Each worker can pull files from Google Cloud Storage via @scaladoc[DistCache](com.spotify.scio.values.DistCache) to be used in transforms locally, similar to Hadoop distributed cache. See @extref[DistCacheExample.scala](example:DistCacheExample). - Type safe BigQuery IO via Scala macros. Case classes and converters are generated at compile time based on BQ schema. This eliminates the error prone process of handling generic JSON objects. See @extref[TypedBigQueryTornadoes.scala](example:TypedBigQueryTornadoes). - Sinks (`saveAs*` methods) return `ClosedTap[T]` that can be opened either in another pipeline as `SCollection[T]` or directly as `Iterator[T]` once the current pipeline completes. This enables complex pipeline orchestration. See @extref[WordCountOrchestration.scala](example:WordCountOrchestration). diff --git a/site/src/main/paradox/Scio-Unit-Tests.md b/site/src/main/paradox/Scio-Unit-Tests.md index 9f5793f0a8..804d80e610 100644 --- a/site/src/main/paradox/Scio-Unit-Tests.md +++ b/site/src/main/paradox/Scio-Unit-Tests.md @@ -35,7 +35,7 @@ Using `JobTest`, you can test the entire pipeline. Specify the type of the class The `input` function injects your input test data. Note that the `TestIO[T]` should match the input source used in the pipeline e.g. TextIO for sc.textFile, AvroIO for sc.avro. The TextIO id (“in.txt”) should match the one specified in the args. -The output function evaluates the output of the pipeline using the provided assertion from the `SCollectionMatchers`. More info on `SCollectionMatchers` can be found [here](https://spotify.github.io/scio/api/com/spotify/scio/testing/SCollectionMatchers.html). In this example, we are asserting that the output of the pipeline should contain an `SCollection` with elements that in the expected variable in any order. +The output function evaluates the output of the pipeline using the provided assertion from the `SCollectionMatchers`. More info on `SCollectionMatchers` can be found @scaladoc[here](com.spotify.scio.testing.SCollectionMatchers$). In this example, we are asserting that the output of the pipeline should contain an `SCollection` with elements that in the expected variable in any order. Also, note that the `TestIO[T]` should match the output used in the pipeline e.g. TextIO for sc.saveAsTextFile The run function will run the pipeline. @@ -55,7 +55,7 @@ Since we have two input sources, we have to specify both in the `JobTest`. Note ### Test partial pipeline To test a section of a pipeline, use `runWithContext`. The TriggerExample.extractFlowInfo test in @github[TriggerExampleTest](/scio-examples/src/test/scala/com/spotify/scio/examples/cookbook/TriggerExampleTest.scala) tests only the extractFlowInfo part of the pipeline. -The data variable hold the test data and `sc.parallelize` will transform the input iterable to a `SCollection` of strings. TriggerExample.extractFlowInfo will be executed using the `ScioContext` and you can then specify assertions against the result of the pipeline. 
+The data variable hold the test data and `sc.parallelize` will transform the input Iterable to an `SCollection` of strings. TriggerExample.extractFlowInfo will be executed using the `ScioContext` and you can then specify assertions against the result of the pipeline. @@snip [TriggerExampleTest.scala](/scio-examples/src/test/scala/com/spotify/scio/examples/cookbook/TriggerExampleTest.scala) { #TriggerExampleTest_example } @@ -82,7 +82,7 @@ To run the test, we use the `runWithContext`, this will run calculateTeamScores @@snip [LeaderBoardTest.scala](/scio-examples/src/test/scala/com/spotify/scio/examples/complete/game/LeaderBoardTest.scala) { #LeaderBoardTest_example_3 } -Scio provides more `SCollection` assertions such as `inWindow`, `inCombinedNonLatePanes`, `inFinalPane`, and `inOnlyPane`. You can find the full list [here](https://spotify.github.io/scio/api/com/spotify/scio/testing/SCollectionMatchers.html). More information on testing unbounded pipelines can be found [here](https://beam.apache.org/blog/2016/10/20/test-stream.html). +Scio provides more `SCollection` assertions such as `inWindow`, `inCombinedNonLatePanes`, `inFinalPane`, and `inOnlyPane`. You can find the full list @scaladoc[here](com.spotify.scio.testing.SCollectionMatchers). More information on testing unbounded pipelines can be found [here](https://beam.apache.org/blog/2016/10/20/test-stream.html). ### Test with transform overrides diff --git a/site/src/main/paradox/SideInputs.md b/site/src/main/paradox/SideInputs.md new file mode 100644 index 0000000000..acc9235271 --- /dev/null +++ b/site/src/main/paradox/SideInputs.md @@ -0,0 +1,102 @@ +# Side Inputs + +Side inputs provide a way to broadcast small amounts of data to all workers. + +Side inputs are more performant if they fit entirely into memory. +We therefore recommend using the @ref[singleton variants](SideInputs.md#singleton-variants) if possible, and setting the @ref[--workerCacheMb option](SideInputs.md#workercachemb-option). +For a dataflow job on a standard worker we recommend a maximum size of roughly 1GB for a side input. +If you have a need for a larger side input, see the section on @ref[Sparkey side inputs](extras/Sparkey.md#as-a-side-input). + +See also the [Beam Programming Guide's section on Side Inputs](https://beam.apache.org/documentation/programming-guide/#side-inputs) which provides some additional details. + +## Standard side inputs + +Converting an `SCollection` to a side-input `Seq` or `Iterable` is supported via `asListSideInput` and `asIterableSideInput` respectively: + +```scala mdoc:compile-only +import com.spotify.scio.values.{SCollection, SideInput} + +val stringElements: SCollection[String] = ??? +val stringListSI: SideInput[Seq[String]] = stringElements.asListSideInput +val stringIterSI: SideInput[Iterable[String]] = stringElements.asIterableSideInput +``` + +For keyed `SCollections`, Scio provides `asMapSideInput` for when there is a unique key-value relationship and `asMultiMapSideInput` for when a key may have multiple values: + +```scala mdoc:compile-only +import com.spotify.scio.values.{SCollection, SideInput} + +val keyedElements: SCollection[(String, String)] = ??? +val mapSingleSI: SideInput[Map[String, String]] = keyedElements.asMapSideInput +val mapMultiSI: SideInput[Map[String, Iterable[String]]] = keyedElements.asMultiMapSideInput +``` + +## Singleton variants + +In addition to standard Beam `SideInput`s, Scio also provides `Singleton` variants that are often more performant than the Beam defaults. 
+ +For `SCollection`s with a single element, `asSingletonSideInput` will convert it to a side input: + +```scala mdoc:compile-only +import com.spotify.scio.values.{SCollection, SideInput} + +val elements: SCollection[Int] = ??? +val sumSI: SideInput[Int] = elements.sum.asSingletonSideInput +``` + +To get an `SideInput` of `Set[T]`, use `asSetSingletonSideInput`: + +```scala mdoc:compile-only +import com.spotify.scio.values.{SCollection, SideInput} + +val elements: SCollection[String] = ??? +val setSI: SideInput[Set[String]] = elements.asSetSingletonSideInput +``` + +For keyed `SCollection`s, `asMapSingletonSideInput` for when there is a unique key-value relationship and `asMultiMapSingletonSideInput` for when a key may have multiple values: + +```scala mdoc:compile-only +import com.spotify.scio.values.{SCollection, SideInput} + +val keyedElements: SCollection[(String, String)] = ??? +val mapSingleSI: SideInput[Map[String, String]] = keyedElements.asMapSingletonSideInput +val mapMultiSI: SideInput[Map[String, Iterable[String]]] = keyedElements.asMultiMapSingletonSideInput +``` + +## Side input context + +To 'join' a `SideInput`, use `withSideInputs`, then access it via the `SideInputContext`: + +```scala +import com.spotify.scio.values.{SCollection, SideInput} + +val keyedElements: SCollection[(String, String)] = ??? +val mapSingleSI: SideInput[Map[String, String]] = keyedElements.asMapSingletonSideInput + +val elements: SCollection[String] = ??? +elements + .withSideInputs(mapSingleSI) + .map { case (element, ctx) => + val mapSingle: Map[String, String] = ctx(mapSingleSI) + val value: Option[String] = mapSingle.get(element) + value + } +``` + +## workerCacheMb option + +By default, Dataflow workers allocate 100MB (see @javadoc[DataflowWorkerHarnessOptions#getWorkerCacheMb](org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions#getWorkerCacheMb--)) of memory for caching side inputs, and falls back to disk or network. +Jobs with large side inputs may therefore be slow. +To override this default, register `DataflowWorkerHarnessOptions` before parsing command line arguments and then pass `--workerCacheMb=N` when submitting the job: + +```scala mdoc:compile-only +import com.spotify.scio._ +import org.apache.beam.sdk.options.PipelineOptionsFactory +import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions + +def main(cmdlineArgs: Array[String]): Unit = { + PipelineOptionsFactory.register(classOf[DataflowWorkerHarnessOptions]) + val (sc, args) = ContextAndArgs(cmdlineArgs) + ??? 
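+  // Note: DataflowWorkerHarnessOptions must be registered before ContextAndArgs parses
+  // the arguments; the job can then be submitted with e.g. --workerCacheMb=512
+  // (the value 512 is illustrative, not a recommendation).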
+} +``` diff --git a/site/src/main/paradox/dev/How-to-Release.md b/site/src/main/paradox/dev/How-to-Release.md index e92b572d66..03c6a2f935 100644 --- a/site/src/main/paradox/dev/How-to-Release.md +++ b/site/src/main/paradox/dev/How-to-Release.md @@ -17,11 +17,9 @@ Credentials( - Create a PGP key, for example on [keybase.io](https://keybase.io/), and [distribute](https://www.gnupg.org/gph/en/manual/x457.html) it to a public keyserver -## Update documentation and version matrix +## Update version matrix -- Pick a release name from [here](https://en.wikipedia.org/wiki/List_of_Latin_phrases_%28full%29), [here](https://en.wikipedia.org/wiki/List_of_songs_with_Latin_lyrics), [here](https://harrypotter.fandom.com/wiki/List_of_spells), [here](https://en.wikipedia.org/wiki/List_of_Latin_names_of_cities), [here](https://en.wikipedia.org/wiki/List_of_Latin_names_of_countries), or other interesting sources* -- Update the list of release names below -- If the release includes a Beam version bump, update the @ref:[version matrix](../Apache-Beam.md) +If the release includes a Beam version bump, update the @ref:[version matrix](../releases/Apache-Beam.md) ## Automatic (CI) @@ -56,136 +54,3 @@ git push origin vX.Y.Z - Send external announcement to scio-users@googlegroups.com and user@beam.apache.org - Announce on public [Slack](https://slackin.spotify.com/) - Announce on Twitter - -*Starting with `0.4.0` all release names are scientific names of animals with genus and species starting with the same letter, in ascending alphabetical order; Harry Potter spells starting with `0.8.0`; Latin names of cities in ascending alphabetical order starting `0.10.0`; Latin names of countries in ascending alphabetical order starting `0.11.0`. - -## Past release names -### 0.12.x - -- [v0.12.5](https://github.com/spotify/scio/releases/tag/v0.12.5) - _"Hispania"_ -- [v0.12.4](https://github.com/spotify/scio/releases/tag/v0.12.4) - _"Hibernia"_ -- [v0.12.3](https://github.com/spotify/scio/releases/tag/v0.12.3) - _"Helvetia"_ -- [v0.12.2](https://github.com/spotify/scio/releases/tag/v0.12.2) - _"Graecia"_ -- [v0.12.1](https://github.com/spotify/scio/releases/tag/v0.12.1) - _"Gallia"_ -- [v0.12.0](https://github.com/spotify/scio/releases/tag/v0.12.0) - _"Dalmatia"_ - -### 0.11.x - -- [v0.11.14](https://github.com/spotify/scio/releases/tag/v0.11.14) - _"Germania"_ -- [v0.11.13](https://github.com/spotify/scio/releases/tag/v0.11.13) - _"Galatia"_ -- [v0.11.12](https://github.com/spotify/scio/releases/tag/v0.11.12) - _"Formosa"_ -- [v0.11.11](https://github.com/spotify/scio/releases/tag/v0.11.11) - _"Finnia"_ -- [v0.11.10](https://github.com/spotify/scio/releases/tag/v0.11.10) - _"Dania"_ -- [v0.11.9](https://github.com/spotify/scio/releases/tag/v0.11.9) - _"Dacia"_ -- [v0.11.8](https://github.com/spotify/scio/releases/tag/v0.11.8) - _"Cyrenaica"_ -- [v0.11.7](https://github.com/spotify/scio/releases/tag/v0.11.7) - _"Creta"_ -- [v0.11.6](https://github.com/spotify/scio/releases/tag/v0.11.6) - _"Colchis"_ -- [v0.11.5](https://github.com/spotify/scio/releases/tag/v0.11.5) - _"Cambria"_ -- [v0.11.4](https://github.com/spotify/scio/releases/tag/v0.11.4) - _"Britannia"_ -- [v0.11.3](https://github.com/spotify/scio/releases/tag/v0.11.3) - _"Bithynia"_ -- [v0.11.2](https://github.com/spotify/scio/releases/tag/v0.11.2) - _"Batavia"_ -- [v0.11.1](https://github.com/spotify/scio/releases/tag/v0.11.1) - _"Armorica"_ -- [v0.11.0](https://github.com/spotify/scio/releases/tag/v0.11.0) - _"Ariana"_ - -### 0.10.x - -- 
[v0.10.4](https://github.com/spotify/scio/releases/tag/v0.10.4) - _"Edessa"_ -- [v0.10.3](https://github.com/spotify/scio/releases/tag/v0.10.3) - _"Dallasium"_ -- [v0.10.2](https://github.com/spotify/scio/releases/tag/v0.10.2) - _"Cantabrigia"_ -- [v0.10.1](https://github.com/spotify/scio/releases/tag/v0.10.1) - _"Belli Horizontis"_ -- [v0.10.0](https://github.com/spotify/scio/releases/tag/v0.10.0) - _"Aquae Sextiae"_ - -### 0.9.x - -- [v0.9.6](https://github.com/spotify/scio/releases/tag/v0.9.6) - _"Specialis Revelio"_ -- [v0.9.5](https://github.com/spotify/scio/releases/tag/v0.9.5) - _"Colovaria"_ -- [v0.9.4](https://github.com/spotify/scio/releases/tag/v0.9.4) - _"Deletrius"_ -- [v0.9.3](https://github.com/spotify/scio/releases/tag/v0.9.3) - _"Petrificus Totalus"_ -- [v0.9.2](https://github.com/spotify/scio/releases/tag/v0.9.2) - _"Alohomora"_ -- [v0.9.1](https://github.com/spotify/scio/releases/tag/v0.9.1) - _"Aberto"_ -- [v0.9.0](https://github.com/spotify/scio/releases/tag/v0.9.0) - _"Furnunculus"_ - -### 0.8.x - -- [v0.8.4](https://github.com/spotify/scio/releases/tag/v0.8.4) - _"Expecto Patronum"_ -- [v0.8.3](https://github.com/spotify/scio/releases/tag/v0.8.3) - _"Draconifors"_ -- [v0.8.2](https://github.com/spotify/scio/releases/tag/v0.8.2) - _"Capacious Extremis"_ -- [v0.8.1](https://github.com/spotify/scio/releases/tag/v0.8.1) - _"Bombarda Maxima"_ -- [v0.8.0](https://github.com/spotify/scio/releases/tag/v0.8.0) - _"Amato Animo Animato Animagus"_ - -### 0.7.x - -- [v0.7.4](https://github.com/spotify/scio/releases/tag/v0.7.4) - _"Watsonula wautieri"_ -- [v0.7.3](https://github.com/spotify/scio/releases/tag/v0.7.3) - _"Vulpes Vulpes"_ -- [v0.7.2](https://github.com/spotify/scio/releases/tag/v0.7.2) - _"Ursus t. Ussuricus"_ -- [v0.7.1](https://github.com/spotify/scio/releases/tag/v0.7.1) - _"Taxidea Taxus"_ -- [v0.7.0](https://github.com/spotify/scio/releases/tag/v0.7.0) - _"Suricata suricatta"_ - -### 0.6.x - -- [v0.6.1](https://github.com/spotify/scio/releases/tag/v0.6.1) - _"Rhyncholestes raphanurus"_ -- [v0.6.0](https://github.com/spotify/scio/releases/tag/v0.6.0) - _"Quelea Quelea"_ - -### 0.5.x - -- [v0.5.7](https://github.com/spotify/scio/releases/tag/v0.5.7) - _"Panthera pardus"_ -- [v0.5.6](https://github.com/spotify/scio/releases/tag/v0.5.6) - _"Orcinus orca"_ -- [v0.5.5](https://github.com/spotify/scio/releases/tag/v0.5.5) - _"Nesolagus netscheri"_ -- [v0.5.4](https://github.com/spotify/scio/releases/tag/v0.5.4) - _"Marmota monax"_ -- [v0.5.3](https://github.com/spotify/scio/releases/tag/v0.5.3) - _"Lasiorhinus latifrons"_ -- [v0.5.2](https://github.com/spotify/scio/releases/tag/v0.5.2) - _"Kobus kob"_ -- [v0.5.1](https://github.com/spotify/scio/releases/tag/v0.5.1) - _"Jaculus jerboa"_ -- [v0.5.0](https://github.com/spotify/scio/releases/tag/v0.5.0) - _"Ia io"_ - -### 0.4.x - -- [v0.4.7](https://github.com/spotify/scio/releases/tag/v0.4.7) - _"Hydrochoerus hydrochaeris"_ -- [v0.4.6](https://github.com/spotify/scio/releases/tag/v0.4.6) - _"Galago gallarum"_ -- [v0.4.5](https://github.com/spotify/scio/releases/tag/v0.4.5) - _"Felis ferus"_ -- [v0.4.4](https://github.com/spotify/scio/releases/tag/v0.4.4) - _"Erinaceus europaeus"_ -- [v0.4.3](https://github.com/spotify/scio/releases/tag/v0.4.3) - _"Dendrohyrax dorsalis"_ -- [v0.4.2](https://github.com/spotify/scio/releases/tag/v0.4.2) - _"Castor canadensis"_ -- [v0.4.1](https://github.com/spotify/scio/releases/tag/v0.4.1) - _"Blarina brevicauda"_ -- [v0.4.0](https://github.com/spotify/scio/releases/tag/v0.4.0) - _"Atelerix 
albiventris"_ - -### 0.3.x - -- [v0.3.6](https://github.com/spotify/scio/releases/tag/v0.3.6) - _"Veritas odit moras"_ -- [v0.3.5](https://github.com/spotify/scio/releases/tag/v0.3.5) - _"Unitas, veritas, carnitas"_ -- [v0.3.4](https://github.com/spotify/scio/releases/tag/v0.3.4) - _"Sectumsempra"_ -- [v0.3.3](https://github.com/spotify/scio/releases/tag/v0.3.3) - _"Petrificus totalus"_ -- [v0.3.2](https://github.com/spotify/scio/releases/tag/v0.3.2) - _"Ut tensio sic vis"_ -- [v0.3.1](https://github.com/spotify/scio/releases/tag/v0.3.1) - _"Expecto patronum"_ -- [v0.3.0](https://github.com/spotify/scio/releases/tag/v0.3.0) - _"Lux et veritas"_ - -### 0.2.x - -- [v0.2.13](https://github.com/spotify/scio/releases/tag/v0.2.13) - _"Ex luna scientia"_ -- [v0.2.12](https://github.com/spotify/scio/releases/tag/v0.2.12) - _"In extremo"_ -- [v0.2.11](https://github.com/spotify/scio/releases/tag/v0.2.11) - _"Saltatio mortis"_ -- [v0.2.10](https://github.com/spotify/scio/releases/tag/v0.2.10) - _"De Mysteriis Dom Sathanas"_ -- [v0.2.9](https://github.com/spotify/scio/releases/tag/v0.2.9) - _"Hoc tempore atque nunc et semper"_ -- [v0.2.8](https://github.com/spotify/scio/releases/tag/v0.2.8) - _"Consummatum est"_ -- [v0.2.7](https://github.com/spotify/scio/releases/tag/v0.2.7) - _"Crescat scientia vita excolatur"_ -- [v0.2.6](https://github.com/spotify/scio/releases/tag/v0.2.6) - _"Sensu lato"_ -- [v0.2.5](https://github.com/spotify/scio/releases/tag/v0.2.5) - _"Imperium in imperio"_ -- [v0.2.4](https://github.com/spotify/scio/releases/tag/v0.2.4) - _"Ab imo pectore"_ -- [v0.2.3](https://github.com/spotify/scio/releases/tag/v0.2.3) - _"Aurea mediocritas"_ -- [v0.2.2](https://github.com/spotify/scio/releases/tag/v0.2.2) - _"Intelligenti pauca"_ -- [v0.2.1](https://github.com/spotify/scio/releases/tag/v0.2.1) - _"Sedes incertae"_ -- [v0.2.0](https://github.com/spotify/scio/releases/tag/v0.2.0) - _"Nulli secundus"_ - -### 0.1.x - -- [v0.1.11](https://github.com/spotify/scio/releases/tag/v0.1.11) - _"In silico"_ -- [v0.1.10](https://github.com/spotify/scio/releases/tag/v0.1.10) - _"Memento vivere"_ -- [v0.1.9](https://github.com/spotify/scio/releases/tag/v0.1.9) - _"Lucem sequimur"_ -- [v0.1.8](https://github.com/spotify/scio/releases/tag/v0.1.8) - _"Nemo saltat sobrius"_ -- [v0.1.7](https://github.com/spotify/scio/releases/tag/v0.1.7) - _"Spem gregis"_ -- [v0.1.6](https://github.com/spotify/scio/releases/tag/v0.1.6) - _"Sic infit"_ -- [v0.1.5](https://github.com/spotify/scio/releases/tag/v0.1.5) - _"Ad astra"_ -- [v0.1.4](https://github.com/spotify/scio/releases/tag/v0.1.4) - _"Ad arbitrium"_ -- [v0.1.3](https://github.com/spotify/scio/releases/tag/v0.1.3) - _"Ut cognoscant te"_ -- [v0.1.2](https://github.com/spotify/scio/releases/tag/v0.1.2) - _"Sapere aude"_ -- [v0.1.1](https://github.com/spotify/scio/releases/tag/v0.1.1) - _"Festina lente"_ -- [v0.1.0](https://github.com/spotify/scio/releases/tag/v0.1.0) - _"Scio me nihil scire"_ diff --git a/site/src/main/paradox/dev/build.md b/site/src/main/paradox/dev/build.md index b5638522ef..7f264c99db 100644 --- a/site/src/main/paradox/dev/build.md +++ b/site/src/main/paradox/dev/build.md @@ -30,7 +30,7 @@ Define `bigquery.project` as a system property. The value can by anything since sbt -Dbigquery.project=dummy-project test ``` -Tasks within the 'it' (integration testing) configuration `it:{compile,test}` currently require access to datasets hosted in an internal Spotify project. 
External users must authenticate against their own GCP project, through the steps outlined in [Getting Started](https://spotify.github.io/scio/Getting-Started.html). +Tasks within the 'it' (integration testing) configuration `it:{compile,test}` currently require access to datasets hosted in an internal Spotify project. External users must authenticate against their own GCP project, through the steps outlined in @ref[Getting Started](../Getting-Started.md). ## IntelliJ IDEA diff --git a/site/src/main/paradox/extras/Algebird.md b/site/src/main/paradox/extras/Algebird.md index 5d95425c80..b0b3351834 100644 --- a/site/src/main/paradox/extras/Algebird.md +++ b/site/src/main/paradox/extras/Algebird.md @@ -1,8 +1,8 @@ # Algebird [Algebird](https://github.com/twitter/algebird) is Twitter's abstract algebra library. It has a lot of reusable modules for parallel aggregation and approximation. One can use any Algebird `Aggregator` or `Semigroup` with: -- `aggregate` and `sum` on @scaladoc[`SCollection[T]`](com.spotify.scio.values.SCollection) -- `aggregateByKey` and `sumByKey` on @scaladoc[`SCollection[(K, V)]`](com.spotify.scio.values.PairSCollectionFunctions) +- `aggregate` and `sum` on @scaladoc[SCollection[T]](com.spotify.scio.values.SCollection) +- `aggregateByKey` and `sumByKey` on @scaladoc[SCollection[(K, V)]](com.spotify.scio.values.PairSCollectionFunctions) See @github[AlgebirdSpec.scala](/scio-examples/src/test/scala/com/spotify/scio/examples/extra/AlgebirdSpec.scala) and [Algebird wiki](https://github.com/twitter/algebird/wiki) for more details. Also see these [slides](http://www.lyh.me/slides/semigroups.html) on semigroups. diff --git a/site/src/main/paradox/extras/Annoy.md b/site/src/main/paradox/extras/Annoy.md new file mode 100644 index 0000000000..0e6769f838 --- /dev/null +++ b/site/src/main/paradox/extras/Annoy.md @@ -0,0 +1,65 @@ +# Annoy + +Scio integrates with Spotify's [Annoy](https://github.com/spotify/annoy), an approximate nearest neighbors library, via [annoy-java](https://github.com/spotify/annoy-java) and [annoy4s](https://github.com/annoy4s/annoy4s). + +## Write + +A keyed `SCollection` with `Int` keys and `Array[Float]` vector values can be saved with @scaladoc[asAnnoy](com.spotify.scio.extra.annoy.AnnoyPairSCollection#asAnnoy(path:String,metric:com.spotify.scio.extra.annoy.package.AnnoyMetric,dim:Int,nTrees:Int):com.spotify.scio.values.SCollection[com.spotify.scio.extra.annoy.AnnoyUri]): + +```scala +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.annoy._ + +val metric: AnnoyMetric = ??? +val numDimensions: Int = ??? +val numTrees: Int = ??? +val itemVectors: SCollection[(Int, Array[Float])] = ??? +itemVectors.asAnnoy("gs://output-path", metric, numDimensions, numTrees) +``` + +## Side Input + +An Annoy file can be read directly as a `SideInput` with @scaladoc[annoySideInput](com.spotify.scio.extra.annoy.AnnoyScioContext#annoySideInput(path:String,metric:com.spotify.scio.extra.annoy.package.AnnoyMetric,dim:Int):com.spotify.scio.values.SideInput[com.spotify.scio.extra.annoy.package.AnnoyReader]): + +```scala +import com.spotify.scio._ +import com.spotify.scio.values.SideInput +import com.spotify.scio.extra.annoy._ + +val sc: ScioContext = ??? + +val metric: AnnoyMetric = ??? +val numDimensions: Int = ??? 
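+// The metric and dimension passed to annoySideInput should match those used when the
+// Annoy index at the input path was built.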
+val annoySI: SideInput[AnnoyReader] = sc.annoySideInput("gs://input-path", metric, numDimensions) +``` + +Alternatively, an `SCollection` can be converted directly to a `SideInput` with @scaladoc +[`asAnnoySideInput`](com.spotify.scio.extra.annoy.AnnoyPairSCollection#asAnnoySideInput(metric:com.spotify.scio.extra.annoy.package.AnnoyMetric,dim:Int):com.spotify.scio.values.SideInput[com.spotify.scio.extra.annoy.package.AnnoyReader]): + +```scala +import com.spotify.scio.values.{SCollection, SideInput} +import com.spotify.scio.extra.annoy._ + +val metric: AnnoyMetric = ??? +val numDimensions: Int = ??? +val numTrees: Int = ??? +val itemVectors: SCollection[(Int, Array[Float])] = ??? +val annoySI: SideInput[AnnoyReader] = itemVectors.asAnnoySideInput(metric, numDimensions, numTrees) +``` + +An @scaladoc[AnnoyReader](com.spotify.scio.extra.annoy.AnnoyReader) provides access to item vectors and their nearest neighbors: + +```scala +import com.spotify.scio.values.{SCollection, SideInput} +import com.spotify.scio.extra.annoy._ + +val annoySI: SideInput[AnnoyReader] = ??? +val elements: SCollection[Int] = ??? +elements + .withSideInputs(annoySI) + .map { case (element, ctx) => + val annoyReader: AnnoyReader = ctx(annoySI) + val vec: Array[Float] = annoyReader.getItemVector(element) + element -> annoyReader.getNearest(vec, 1) + } +``` diff --git a/site/src/main/paradox/extras/AsyncDoFn.md b/site/src/main/paradox/extras/AsyncDoFn.md new file mode 100644 index 0000000000..5f6719e9a2 --- /dev/null +++ b/site/src/main/paradox/extras/AsyncDoFn.md @@ -0,0 +1,42 @@ +# AsyncDoFn + +Scio's @scaladoc[BaseAsyncDoFn](com.spotify.scio.transforms.BaseAsyncDoFn) provides standard handling for sending asynchronous requests and capturing the responses for a bundle of pipeline elements. +`BaseAsyncDoFn` is a subclass of @scaladoc[DoFnWithResource](com.spotify.scio.transforms.DoFnWithResource) which handles the creation and re-use of client classes. +Scio provides several future-specific subclasses to choose from depending on the return type of the client: + +* @scaladoc[GuavaAsyncDoFn](com.spotify.scio.transforms.GuavaAsyncDoFn) for clients that return Guava's `ListenableFuture` +* @scaladoc[JavaAsyncDoFn](com.spotify.scio.transforms.JavaAsyncDoFn) for clients that return `CompletableFuture` +* @scaladoc[ScalaAsyncDoFn](com.spotify.scio.transforms.ScalaAsyncDoFn) for clients that return a scala `Future` + +`BaseAsyncDoFn` will wait for all futures for all bundle elements to be returned before completing the bundle. +A failure of any request for an item in the bundle will cause the entire bundle to be retried. +Requests should therefore be idempotent. + +Given this Guava-based mock client: +```scala +import com.google.common.util.concurrent.{ListenableFuture, Futures} + +case class MyClient(value: String) { + def request(i: Int): ListenableFuture[String] = Futures.immediateFuture(s"$value$i") +} +``` + +For client which returns a `ListenableFuture`, a custom `DoFn` can be defined using `GuavaAsyncDoFn`. +Note the configured `ResourceType`, which will re-use the client for all threads on a worker, see @scaladoc[ResourceType](com.spotify.scio.transforms.DoFnWithResource.ResourceType) for more details. 
+ +```scala +import com.spotify.scio.transforms._ +import com.spotify.scio.transforms.DoFnWithResource.ResourceType +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.transforms.ParDo + +class MyDoFn() extends GuavaAsyncDoFn[Int, String, MyClient] { + override def getResourceType: ResourceType = ResourceType.PER_CLASS + override def createResource(): MyClient = MyClient("foo") + override def processElement(input: Int): ListenableFuture[String] = + getResource.request(input) +} + +val elements: SCollection[Int] = ??? +val result: SCollection[String] = elements.applyTransform(ParDo.of(new MyDoFn())) +``` diff --git a/site/src/main/paradox/extras/BigQueryAvro.md b/site/src/main/paradox/extras/BigQueryAvro.md new file mode 100644 index 0000000000..07d08b3668 --- /dev/null +++ b/site/src/main/paradox/extras/BigQueryAvro.md @@ -0,0 +1,13 @@ +# BigQueryAvro + +Scio provides support for converting Avro schemas to BigQuery [`TableSchema`s](https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/com/google/api/services/bigquery/model/TableSchema.html) and Avro `SpecificRecord`s to a BigQuery [`TableRow`s](https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/com/google/api/services/bigquery/model/TableRow.html). + +```scala mdoc:compile-only +import com.spotify.scio.extra.bigquery.AvroConverters +import org.apache.avro.specific.SpecificRecord +import com.google.api.services.bigquery.model.{TableFieldSchema, TableSchema, TableRow} + +val myAvroInstance: SpecificRecord = ??? +val bqSchema: TableSchema = AvroConverters.toTableSchema(myAvroInstance.getSchema) +val bqRow: TableRow = AvroConverters.toTableRow(myAvroInstance) +``` diff --git a/site/src/main/paradox/extras/DistCache.md b/site/src/main/paradox/extras/DistCache.md new file mode 100644 index 0000000000..fda49afd23 --- /dev/null +++ b/site/src/main/paradox/extras/DistCache.md @@ -0,0 +1,26 @@ +# DistCache + +Scio supports a distributed cache, @scaladoc[DistCache](com.spotify.scio.values.DistCache), that is similar to Hadoop's. + +A set of one or more paths that back the DistCache are lazily downloaded by all workers, then passed through a user-defined initialization function `initFn` to be deserialized into an in-memory representation that can be used by all threads on that worker. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import org.joda.time.Instant +import java.io.File + +val sc: ScioContext = ??? +val uri: String = ??? +def parseFn(file: File): Map[String, String] = ??? + +val dc = sc.distCache(uri) { file => parseFn(file) } + +val elements: SCollection[String] = ??? +elements.flatMap { e => + val optResult = dc().get(e) + optResult +} +``` + +See @extref[DistCacheExample.scala](example:DistCacheExample). diff --git a/site/src/main/paradox/extras/Fanout.md b/site/src/main/paradox/extras/Fanout.md new file mode 100644 index 0000000000..2c62f93c42 --- /dev/null +++ b/site/src/main/paradox/extras/Fanout.md @@ -0,0 +1,34 @@ +# Fanout + +Scio ships with two `SCollection` variants that provide _fanout_ over aggregations where an interim aggregation is performed before the final aggregation is computed. +The interim step pairs the data to be aggregated with a synthetic key, then aggregates within this artificial keyspace before passing the partial aggregations on to the final aggregation step. 
+The interim step requires an additional shuffle but can make the aggregation more parallelizable and reduces the impact of a hot key. + +The `aggregate`, `combine`, `fold`, `reduce`, `sum` transforms and their keyed variants are supported. + +## WithFanout + +@scaladoc[withFanout](com.spotify.scio.values.SCollection#withFanout(fanout:Int):com.spotify.scio.values.SCollectionWithFanout[T]) aggregates over the number of synthetic keys specified by the `fanout` argument: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection + +val elements: SCollection[Int] = ??? +val result: SCollection[Int] = elements.withFanout(fanout = 10).sum +``` + +## WithHotKeyFanout + +For hot keys, two variants allow a user to specify either a static fanout via an integer `hotKeyFanout` argument to @scaladoc[withHotKeyFanout](com.spotify.scio.values.PairSCollectionFunctions#withHotKeyFanout(hotKeyFanout:Int):com.spotify.scio.values.SCollectionWithHotKeyFanout[K,V]), or a dynamic per-key fanout via a function `K => Int` argument, also called `hotKeyFanout` to @scaladoc[withHotKeyFanout](com.spotify.scio.values.PairSCollectionFunctions#withHotKeyFanout(hotKeyFanout:K=%3EInt):com.spotify.scio.values.SCollectionWithHotKeyFanout[K,V]): + +```scala +import com.spotify.scio._ +import com.spotify.scio.values.SCollection + +val elements: SCollection[(String, Int)] = ??? +val staticResult: SCollection[(String, Int)] = elements.withHotKeyFanout(hotKeyFanout = 10).sumByKey +val dynamicResult: SCollection[(String, Int)] = elements + .withHotKeyFanout(hotKeyFanout = s => s.length % 10) + .sumByKey +``` diff --git a/site/src/main/paradox/extras/MutableScalableBloomFilter.md b/site/src/main/paradox/extras/MutableScalableBloomFilter.md new file mode 100644 index 0000000000..7b09f1d182 --- /dev/null +++ b/site/src/main/paradox/extras/MutableScalableBloomFilter.md @@ -0,0 +1,34 @@ +# MutableScalableBloomFilter + +Scio ships with an implementation of a scalable Bloom Filter, as described in ["Scalable Bloom Filters", Almeida, Baquero, et al.](http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf). + +A Bloom filter is an _approximate_ data structure that behaves in a similar way to a `Set` and can answer the question "does this set _probably_ contain this value?". + +As an example, if you want to be able to answer the question "Does this user listen to Beyonce?" you could construct a `Set` containing all the ids of all the users that listened to Beyonce, persist it, the do a lookup into the set every time you need to know. +The issue is that the `Set` will quickly become very large, especially for such a popular artist, and if you want to maintain many such sets, you will run into scaling issues. +Bloom filters solve this problem by accepting some false positives for significant compression. + +A _scalable_ Bloom filter is a sequence of Bloom filters that are iteratively added-to once each constituent filter nears its capacity (the point at which false positive guarantees break down). +This is useful because inputs to a Bloom filter are lost, and it is not possible to resize a filter once constructed. +The [MutableScalableBloomFilter](com.spotify.scio.hash.MutableScalableBloomFilter) implementation shipping with scio maintains some additional metadata which allows it to scale automatically when necessary. + +See the @scaladoc[MutableScalableBloomFilter](com.spotify.scio.hash.MutableScalableBloomFilter$) for details on how to properly size a scalable Bloom filter. 
+ +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.hash.MutableScalableBloomFilter +import magnolify.guava.auto._ + +case class TrackListen(trackId: String, userId: String) + +val elements: SCollection[TrackListen] = ??? +val msbfs: SCollection[MutableScalableBloomFilter[String]] = elements + .map { t => t.trackId -> t.userId } + .groupByKey + .map { case (trackId, userIds) => + val msbf = MutableScalableBloomFilter[String](1_000_000) + msbf ++= userIds + msbf + } +``` diff --git a/site/src/main/paradox/Scio-REPL.md b/site/src/main/paradox/extras/Scio-REPL.md similarity index 93% rename from site/src/main/paradox/Scio-REPL.md rename to site/src/main/paradox/extras/Scio-REPL.md index c28c7ada28..f34a69f0d4 100644 --- a/site/src/main/paradox/Scio-REPL.md +++ b/site/src/main/paradox/extras/Scio-REPL.md @@ -37,7 +37,7 @@ Scio context available as 'sc' scio> ``` -A @scaladoc[`ScioContext`](com.spotify.scio.ScioContext) is created on REPL startup as `sc` and a starting point to most operations. Use `tab` completion, history and other REPL goodies to play around. +A @scaladoc[ScioContext](com.spotify.scio.ScioContext) is created on REPL startup as `sc` and a starting point to most operations. Use `tab` completion, history and other REPL goodies to play around. ### Start from SBT console (Scala `2.11.x`+ only) @@ -101,7 +101,7 @@ val scioResult = sc.run().waitUntilDone() val values = scioResult.tap(wordCount).value.take(3) ``` -Make sure `README.md` is in the current directory. This example counts words in local file using a local runner (@javadoc[`DirectRunner`](org.apache.beam.runners.direct.DirectRunner) and writes result in a local file. The pipeline and actual computation starts on `sc.run()`. The last command take 3 lines from results and prints them. +Make sure `README.md` is in the current directory. This example counts words in local file using a local runner (@javadoc[DirectRunner](org.apache.beam.runners.direct.DirectRunner) and writes result in a local file. The pipeline and actual computation starts on `sc.run()`. The last command take 3 lines from results and prints them. ### Local pipeline ++ @@ -183,7 +183,7 @@ val result = sc .take(3) ``` -In this case we are reading data from GCS and performing computation in GCE virtual machines managed by Dataflow service. The last line is an example of reading data from GCS files to local memory after a context is closed. Most write operations in Scio return `Future[Tap[T]]` where a [`Tap[T]`](http://spotify.github.io/scio/api/com/spotify/scio/io/Tap.html) encapsulates some dataset that can be re-opened in another context or directly. +In this case we are reading data from GCS and performing computation in GCE virtual machines managed by Dataflow service. The last line is an example of reading data from GCS files to local memory after a context is closed. Most write operations in Scio return `Future[Tap[T]]` where a @scaladoc[Tap[T]](com.spotify.scio.io.Tap) encapsulates some dataset that can be re-opened in another context or directly. Use `:scioOpts` to view or update Dataflow options inside the REPL. New options will be applied the next time you create a context. @@ -321,7 +321,7 @@ def result = sc.run() ### Running jobs asynchronously -When using REPL and Dataflow service consider using the non-blocking @javadoc[`DataflowRunner`](org.apache.beam.runners.dataflow.DataflowRunner) for a more interactive experience. 
To start: +When using REPL and Dataflow service consider using the non-blocking @javadoc[DataflowRunner](org.apache.beam.runners.dataflow.DataflowRunner) for a more interactive experience. To start: ``` java -jar scio-repl-0.7.0.jar \ @@ -366,7 +366,7 @@ def result = sc.run() def state = result.state ``` -Note that now `sc.run()` doesn't block and wait until job completes and gives back control of the REPL right away. Use @scaladoc[`ScioExecutionContext`](com.spotify.scio.ScioExecutionContext) to check for progress, results and orchestrate jobs. +Note that now `sc.run()` doesn't block and wait until job completes and gives back control of the REPL right away. Use @scaladoc[ScioExecutionContext](com.spotify.scio.ScioExecutionContext) to check for progress, results and orchestrate jobs. ### Multiple Scio contexts @@ -386,7 +386,7 @@ You can use those in combination with `DataflowRunner` to run multiple pipelines ### BigQuery client -Whenever possible leverage BigQuery! @scaladoc[`@BigQueryType`](com.spotify.scio.bigquery.types.BigQueryType) annotations enable type safe and civilized +Whenever possible leverage BigQuery! @scaladoc[@BigQueryType](com.spotify.scio.bigquery.types.BigQueryType) annotations enable type safe and civilized integration with BigQuery inside Scio. Here is example of using the annotations and BigQuery client to read and write typed data directly without Scio context. ``` diff --git a/site/src/main/paradox/extras/Sort-Merge-Bucket.md b/site/src/main/paradox/extras/Sort-Merge-Bucket.md index 92b2b8dd73..1a84e49888 100644 --- a/site/src/main/paradox/extras/Sort-Merge-Bucket.md +++ b/site/src/main/paradox/extras/Sort-Merge-Bucket.md @@ -1,7 +1,7 @@ # Sort Merge Bucket Sort Merge Bucket is a technique for writing data to file system in deterministic file locations, -sorted according by some pre-determined key, so that it can later be read in as key groups with +sorted according to some pre-determined key, so that it can later be read in as key groups with no shuffle required. Since each element is assigned a file destination (bucket) based on a hash of its join key, we can use the same technique to cogroup multiple Sources as long as they're written using the same key and hashing scheme. diff --git a/site/src/main/paradox/extras/Sorter.md b/site/src/main/paradox/extras/Sorter.md new file mode 100644 index 0000000000..a8bf2dd3be --- /dev/null +++ b/site/src/main/paradox/extras/Sorter.md @@ -0,0 +1,15 @@ +# Sorter + +The @scaladoc[sortValues](com.spotify.scio.extra.sorter.syntax.SorterOps#sortValues(memoryMB:Int)(implicitk1Coder:com.spotify.scio.coders.Coder[K1],implicitk2Coder:com.spotify.scio.coders.Coder[K2],implicitvCoder:com.spotify.scio.coders.Coder[V]):com.spotify.scio.values.SCollection[(K1,Iterable[(K2,V)])]) transform sorts values by a secondary key following a `groupByKey` on the primary key, spilling sorting to disk if required. +The `memoryMB` controls the allowable in-memory overhead before the sorter spills data to disk. +Keys are compared based on the byte-array representations produced by their Beam coder. + +```scala +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.sorter._ + +val elements: SCollection[(String, (String, Int))] = ??? 
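+// Elements are (primary key, (secondary key, value)) pairs: groupByKey groups on the
+// primary key, and sortValues then orders each group by the secondary key.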
+val sorted: SCollection[(String, Iterable[(String, Int)])] = elements + .groupByKey + .sortValues(memoryMB = 100) +``` diff --git a/site/src/main/paradox/extras/Sparkey.md b/site/src/main/paradox/extras/Sparkey.md new file mode 100644 index 0000000000..2fafaab6bd --- /dev/null +++ b/site/src/main/paradox/extras/Sparkey.md @@ -0,0 +1,63 @@ +# Sparkey + +Scio supports Spotify's [Sparkey](https://github.com/spotify/sparkey), which provides a simple disk-backed key-value store. + +At Spotify, sparkeys are typically used in pipelines as side-inputs where the size of the side-input would be too large to reasonably fit into memory but can still fit on disk. +Scio's suite of `largeHash` functions are backed by sparkeys. + +Scio supports writing any type with a coder to a sparkey by first converting + +## As a Side-Input + +A sparkey side-input is a good choice when you have a very large dataset that needs to be joined with a relatively small dataset, but one which is still too large to fit into memory. +In this case, the @scaladoc[asSparkeySideInput](com.spotify.scio.extra.sparkey.SparkeyPairSCollection#asSparkeySideInput(implicitw:com.spotify.scio.extra.sparkey.package.SparkeyWritable[K,V]):com.spotify.scio.values.SideInput[com.spotify.sparkey.SparkeyReader]) method can be used to broadcast the smaller dataset to all workers and avoid shuffle. + +```scala mdoc:compile-only +import com.spotify.scio.values.{SCollection, SideInput} +import com.spotify.scio.extra.sparkey._ +import com.spotify.sparkey._ + +case class Track(title: String, artistId: String) +case class ArtistMetadata(artistId: String, name: String) + +val tracks: SCollection[Track] = ??? +val metadata: SCollection[ArtistMetadata] = ??? + +val artistNameSI: SideInput[SparkeyReader] = metadata + .map { am => am.artistId -> am.name } + .asSparkeySideInput + +tracks.withSideInputs(artistNameSI) + .map { case (track, context) => + val optArtistName = context(artistNameSI).get(track.artistId) + track -> optArtistName + } +``` + +See also @ref[Large Hash Joins](../Joins.md#large-hash-join), which do the same thing as this simple example but with a more compact syntax. + +## Writing + +If a sparkey can be reused by multiple pipelines, it can be saved permanently with @scaladoc[asSparkey](com.spotify.scio.extra.sparkey.SparkeyPairSCollection#asSparkey(implicitw:com.spotify.scio.extra.sparkey.package.SparkeyWritable[K,V]):com.spotify.scio.values.SCollection[com.spotify.scio.extra.sparkey.SparkeyUri]) + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.sparkey._ + +val elements: SCollection[(String, String)] = ??? +elements.asSparkey("gs://output-path") +``` + +## Reading + +Previously-written sparkeys can be loaded directly as side-inputs: + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.{SCollection, SideInput} +import com.spotify.scio.extra.sparkey._ +import com.spotify.sparkey._ + +val sc: ScioContext = ??? +val sparkeySI: SideInput[SparkeyReader] = sc.sparkeySideInput("gs://input-path") +``` diff --git a/site/src/main/paradox/extras/Transforms.md b/site/src/main/paradox/extras/Transforms.md new file mode 100644 index 0000000000..0eb9e67d53 --- /dev/null +++ b/site/src/main/paradox/extras/Transforms.md @@ -0,0 +1,106 @@ +# Transforms + +The `com.spotify.scio.transforms` package provides a selection of transforms with additional functionality. 
+ +# WithResource + +The @scaladoc[WithResource](com.spotify.scio.transforms.syntax.SCollectionWithResourceSyntax.SCollectionWithResourceFunctions) syntax provides a convenient wrapper around @scaladoc[DoFnWithResource](com.spotify.scio.transforms.DoFnWithResource) that allows reuse of some resource class, for example an API client, according to the specified @scaladoc[ResourceType](com.spotify.scio.transforms.DoFnWithResource.ResourceType) behavior for variants of `map`, `filter`, `flatMap`, and `collect`: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.transforms._ +import com.spotify.scio.transforms.DoFnWithResource.ResourceType + +class Client(val name: String) +class ClientNotThreadSafe() { + private var state: Int = 0 + def name(): String = { + val out = s"c$state" + state = state + 1 + out + } +} + +val elements: SCollection[String] = ??? + +elements.mapWithResource(new Client("c1"), ResourceType.PER_CLASS) { + case (client, s) => s + client.name +} +elements.filterWithResource(new Client("c2"), ResourceType.PER_INSTANCE) { + case (client, s) => s.nonEmpty +} +elements.collectWithResource(new Client("c3"), ResourceType.PER_INSTANCE) { + case (client, s) if s.nonEmpty => s + client.name +} +elements.flatMapWithResource(new ClientNotThreadSafe(), ResourceType.PER_CLONE) { + case (client, s) => s + client.name() +} +``` + +## Custom Parallelism + +By default, a worker on dataflow batch pipeline will have a number of threads equal to the number of vCPUs. +In dataflow streaming, the [default number of threads is 300](https://github.com/apache/beam/blob/98210d99b8530346b66fcffe66b893924c910bea/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java#L181). + +To limit the number of concurrent items being processed a worker, @scaladoc[CustomParallelism](com.spotify.scio.transforms.syntax.SCollectionParallelismSyntax.CustomParallelismSCollection) syntax allows setting a `parallelism` argument on variants of `map`, `filter`, `flatMap`, and `collect`: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.transforms._ + +val elements: SCollection[String] = ??? +elements.mapWithParallelism(5) { s => s + "_append" } +elements.filterWithParallelism(5) { s => s.nonEmpty } +elements.flatMapWithParallelism(5) { s => s.split(",") } +elements.collectWithParallelism(5) { case s if s.nonEmpty => s + "_append" } +``` + +# FileDownload + +The @scaladoc[FileDownload](com.spotify.scio.transforms.syntax.SCollectionFileDownloadSyntax.FileDownloadSCollection) syntax provides support for downloading arbitrary `URI`s to a local file, then handling the results: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.transforms._ +import scala.jdk.CollectionConverters._ +import java.net.URI +import java.nio.file.Files +import java.nio.charset.StandardCharsets + +val uris: SCollection[URI] = ??? 
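+// Each URI is downloaded to a local file first; the function receives the
+// java.nio.file.Path of the downloaded copy.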
+val fileContents: SCollection[String] = uris.mapFile { path => + new String(Files.readAllBytes(path), StandardCharsets.UTF_8) +} +val lines: SCollection[String] = uris.flatMapFile { path => + Files.readAllLines(path).asScala +} +``` + +# Safe flatMap + +The @scaladoc[Safe](com.spotify.scio.transforms.syntax.SCollectionSafeSyntax.SpecializedFlatMapSCollection) syntax provides a `safeFlatMap` function that captures any exceptions thrown by the body of the transform and partitions its output into an `SCollection` of successfully-output elements and an `SCollection` of the exception-throwing input elements and the `Throwable` they produced. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.transforms._ + +val elements: SCollection[String] = ??? +val (ok: SCollection[Int], bad: SCollection[(String, Throwable)]) = elements + .safeFlatMap { in => + in.split(",").map { s => s.toInt } + } +``` + +# Pipe + +The @scaladoc[Pipe](com.spotify.scio.transforms.syntax.SCollectionPipeSyntax.PipeSCollection) syntax provides a method to pass elements of an `SCollection[String]` to a specified command-line program. +Additional arguments allow configuration of the working directory, application environment, and setup & teardown commands. + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.transforms._ + +val elements: SCollection[String] = ??? +val upperElements: SCollection[String] = elements.pipe("tr [:lower:] [:upper:]") +``` + diff --git a/site/src/main/paradox/extras/index.md b/site/src/main/paradox/extras/index.md index 692ff34681..dba8a6656d 100644 --- a/site/src/main/paradox/extras/index.md +++ b/site/src/main/paradox/extras/index.md @@ -5,7 +5,17 @@ @@@ index * @ref:[Algebird](Algebird.md) +* @ref:[Annoy](Annoy.md) +* @ref:[AsyncDoFn](AsyncDoFn.md) +* @ref:[BigQuery Avro Converters](BigQueryAvro.md) +* @ref:[DistCache](DistCache.md) +* @ref:[Fanout](Fanout.md) * @ref:[HyperLogLog](HyperLogLog.md) +* @ref:[MutableScalableBloomFilter](MutableScalableBloomFilter.md) +* @ref:[Sorter](Sorter.md) * @ref:[Sort Merge Bucket](Sort-Merge-Bucket.md) +* @ref:[Sparkey](Sparkey.md) +* @ref:[REPL](Scio-REPL.md) +* @ref:[Transforms](Transforms.md) @@@ diff --git a/site/src/main/paradox/index.md b/site/src/main/paradox/index.md index b69d483b40..9daf9056dc 100644 --- a/site/src/main/paradox/index.md +++ b/site/src/main/paradox/index.md @@ -7,7 +7,7 @@ Scio is a Scala API for [Apache Beam](https://beam.apache.org/) and [Google Cloud Dataflow](https://github.com/GoogleCloudPlatform/DataflowJavaSDK) inspired by [Apache Spark](https://spark.apache.org/) and [Scalding](https://github.com/twitter/scalding). -@ref:[Getting Started](Getting-Started.md) is the best place to start with Scio. If you are new to Apache Beam and distributed data processing, check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. If you have experience with other Scala data processing libraries, check out this comparison between [[Scio, Scalding and Spark]]. Finally check out this document about the relationship between [[Scio, Beam and Dataflow]]. +@ref:[Getting Started](Getting-Started.md) is the best place to start with Scio. 
If you are new to Apache Beam and distributed data processing, check out the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/) first for a detailed explanation of the Beam programming model and concepts. If you have experience with other Scala data processing libraries, check out this comparison between [[Scio, Scalding and Spark]]. Example Scio pipelines and tests can be found under @github[scio-examples](/scio-examples/src). A lot of them are direct ports from Beam's Java [examples](https://github.com/apache/beam/tree/master/examples). See this [page](https://spotify.github.io/scio/examples/) for some of them with side-by-side explanation. Also see [Big Data Rosetta Code](https://github.com/spotify/big-data-rosetta-code) for common data processing code snippets in Scio, Scalding and Spark. @@ -20,85 +20,80 @@ See @scaladoc[Scio Scaladocs](com.spotify.scio.index) for current API documenta ## Documentation - @ref:[Getting Started](Getting-Started.md) - current API documentation -- @ref:[Scio REPL](Scio-REPL.md) - tutorial for the interactive Scio REPL -- @ref:[Scio, Beam and Dataflow](Scio,-Beam-and-Dataflow.md) - how Scio concepts map to Beam and Dataflow - @ref:[Scio, Scalding and Spark](Scio,-Scalding-and-Spark.md) - comparison of these frameworks - @ref:[Runners](Runners.md) - how Scio handles Beam runners and runner specific logic - @ref:[Scio data guideline](Scio-data-guideline.md) - guideline for common problems -- @ref:[Apache Beam](Apache-Beam.md) - notes on Apache Beam compatibility - @ref:[Releases](releases/index.md) - Detailed release notes on new Scio releases - @ref:[FAQ](FAQ.md) - frequently asked questions -- @ref:[Powered By](Powered-By.md) - see who is using Scio in production ### IO - - @ref:[Avro](io/Avro.md) - using Scio with Avro files - - @ref:[BigQuery](io/BigQuery.md) - using Scio with BigQuery in a type safe way - - @ref:[Bigtable](io/Bigtable.md) - using Scio with Bigtable - - @ref:[Parquet](io/Parquet.md) - using Scio with Parquet files - - @ref:[Protobuf](io/Protobuf.md) - using Scio with Protobuf +- @ref:[Avro](io/Avro.md) - using Scio with Avro files +- @ref:[BigQuery](io/BigQuery.md) - using Scio with BigQuery in a type safe way +- @ref:[Bigtable](io/Bigtable.md) - using Scio with Bigtable +- @ref:[Parquet](io/Parquet.md) - using Scio with Parquet files +- @ref:[Protobuf](io/Protobuf.md) - using Scio with Protobuf ### Extras - - @ref:[Algebird](extras/Algebird.md) - - @ref:[Sort Merge Bucket](extras/Sort-Merge-Bucket.md) +- @ref:[Algebird](extras/Algebird.md) +- @ref:[Sort Merge Bucket](extras/Sort-Merge-Bucket.md) ### Internals - - @ref:[ScioIO](internals/ScioIO.md) - new IO system to simplify implementation and stubbing in `JobTest` - - @ref:[OverrideTypeProvider](internals/OverrideTypeProvider.md) - custom mappings for type-safe BigQuery - - @ref:[Kryo](internals/Kryo.md) - [Kryo](https://github.com/EsotericSoftware/kryo) data serialization - - @ref:[Coders](internals/Coders.md) - new [Magnolia](https://github.com/softwaremill/magnolia) based Coders derivation +- @ref:[ScioIO](internals/ScioIO.md) - new IO system to simplify implementation and stubbing in `JobTest` +- @ref:[OverrideTypeProvider](internals/OverrideTypeProvider.md) - custom mappings for type-safe BigQuery +- @ref:[Kryo](internals/Kryo.md) - [Kryo](https://github.com/EsotericSoftware/kryo) data serialization +- @ref:[Coders](internals/Coders.md) - new [Magnolia](https://github.com/softwaremill/magnolia) based Coders derivation ## Further Readings - - 
[Spotify Unwrapped: How We Brought You a Decade of Data](https://engineering.atspotify.com/2020/02/18/spotify-unwrapped-how-we-brought-you-a-decade-of-data/) - - [Scio 0.7: a Deep Dive](https://engineering.atspotify.com/2019/05/30/scio-0-7-a-deep-dive/) - - [Big Data Processing at Spotify: The Road to Scio (Part 1)](https://labs.spotify.com/2017/10/16/big-data-processing-at-spotify-the-road-to-scio-part-1/) - - [Big Data Processing at Spotify: The Road to Scio (Part 2)](https://labs.spotify.com/2017/10/23/big-data-processing-at-spotify-the-road-to-scio-part-2/) - - [The world beyond batch: Streaming 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) - - [The world beyond batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102) - - [Dataflow/Beam & Spark: A Programming Model Comparison](https://cloud.google.com/dataflow/blog/dataflow-beam-and-spark-comparison) - - [VLDB paper](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) on the Dataflow Model +- [Spotify Unwrapped: How We Brought You a Decade of Data](https://engineering.atspotify.com/2020/02/18/spotify-unwrapped-how-we-brought-you-a-decade-of-data/) +- [Scio 0.7: a Deep Dive](https://engineering.atspotify.com/2019/05/30/scio-0-7-a-deep-dive/) +- [Big Data Processing at Spotify: The Road to Scio (Part 1)](https://labs.spotify.com/2017/10/16/big-data-processing-at-spotify-the-road-to-scio-part-1/) +- [Big Data Processing at Spotify: The Road to Scio (Part 2)](https://labs.spotify.com/2017/10/23/big-data-processing-at-spotify-the-road-to-scio-part-2/) +- [The world beyond batch: Streaming 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) +- [The world beyond batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102) +- [Dataflow/Beam & Spark: A Programming Model Comparison](https://cloud.google.com/dataflow/blog/dataflow-beam-and-spark-comparison) +- [VLDB paper](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) on the Dataflow Model ## Presentations - - [Techbytes: Data Processing with Scio](https://engineering.atspotify.com/2019/10/16/techbytes-data-processing-with-scio/) - Spotify Engineering Talk, 2019 - - [Techbytes: Handling Big Data at Spotify](https://engineering.atspotify.com/2019/10/16/techbytes-handling-big-data-at-spotify/) - Spotify Engineering Talk, 2019 - - [Scio - Big Data on Google Cloud with Scala and Scio](https://docs.google.com/presentation/d/1F02Lwnqm9H3cGqDQhIZ3gbftyLQSnVMRxX69H_d04OE/edit#slide=id.p4) - Apache Beam Summit London 2018 Talk - - [Sorry - How Bieber broke Google Cloud at Spotify](https://www.youtube.com/watch?v=1dchSsac3T4) ([slides](https://www.slideshare.net/sinisalyh/sorry-how-bieber-broke-google-cloud-at-spotify)) - Scala Up North 2017 Talk - - [Scio - Moving to Google Cloud A Spotify Story](https://www.infoq.com/presentations/scio) ([slides](https://www.slideshare.net/sinisalyh/scio-moving-to-google-cloud-a-spotify-story)) - Philly ETE 2017 Talk - - [Scio - A Scala API for Google Cloud Dataflow & Apache Beam](https://www.youtube.com/watch?v=4wDwVgODyAg) ([slides](https://www.slideshare.net/sinisalyh/scio-a-scala-api-for-google-cloud-dataflow-apache-beam)) - Scala by the Bay 2016 Talk - - [From stream to recommendation with Cloud Pub/Sub and Cloud Dataflow](https://www.youtube.com/watch?v=xT6tQAIywFQ) - GCP NEXT 16 Talk - - [Apache Beam Presentation Materials](https://beam.apache.org/contribute/presentation-materials/) +- [Scio in Depth](https://www.youtube.com/watch?v=cGvaQp_h5ek) - Apache 
Beam Summit, 2022 +- [Techbytes: Data Processing with Scio](https://engineering.atspotify.com/2019/10/16/techbytes-data-processing-with-scio/) - Spotify Engineering Talk, 2019 +- [Techbytes: Handling Big Data at Spotify](https://engineering.atspotify.com/2019/10/16/techbytes-handling-big-data-at-spotify/) - Spotify Engineering Talk, 2019 +- [Scio - Big Data on Google Cloud with Scala and Scio](https://docs.google.com/presentation/d/1F02Lwnqm9H3cGqDQhIZ3gbftyLQSnVMRxX69H_d04OE/edit#slide=id.p4) - Apache Beam Summit London 2018 Talk +- [Sorry - How Bieber broke Google Cloud at Spotify](https://www.youtube.com/watch?v=1dchSsac3T4) ([slides](https://www.slideshare.net/sinisalyh/sorry-how-bieber-broke-google-cloud-at-spotify)) - Scala Up North 2017 Talk +- [Scio - Moving to Google Cloud A Spotify Story](https://www.infoq.com/presentations/scio) ([slides](https://www.slideshare.net/sinisalyh/scio-moving-to-google-cloud-a-spotify-story)) - Philly ETE 2017 Talk +- [Scio - A Scala API for Google Cloud Dataflow & Apache Beam](https://www.youtube.com/watch?v=4wDwVgODyAg) ([slides](https://www.slideshare.net/sinisalyh/scio-a-scala-api-for-google-cloud-dataflow-apache-beam)) - Scala by the Bay 2016 Talk +- [From stream to recommendation with Cloud Pub/Sub and Cloud Dataflow](https://www.youtube.com/watch?v=xT6tQAIywFQ) - GCP NEXT 16 Talk +- [Apache Beam Presentation Materials](https://beam.apache.org/contribute/presentation-materials/) ## Projects using or related to Scio - - [Featran](https://github.com/spotify/featran) - A Scala feature transformation library for data science and machine learning - - [Big Data Rosetta Code](https://github.com/spotify/big-data-rosetta-code) - Code snippets for solving common big data problems in various platforms. Inspired by [Rosetta Code](https://rosettacode.org/) - - [Ratatool](https://github.com/spotify/ratatool) - A tool for random data sampling and generation, which includes [BigDiffy](https://github.com/spotify/ratatool/blob/master/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala), a Scio library for pairwise field-level statistical diff of data sets ([slides](http://www.lyh.me/slides/bigdiffy.html)) - - [Elitzur](https://github.com/spotify/elitzur) - Data validation for Scala and Scio - - [Scio Koans](https://github.com/nevillelyh/scio-koans/) - A collection of Scio exercises inspired by [Ruby Koans](http://rubykoans.com/) and many others. - - [scio-deep-dive](https://github.com/nevillelyh/scio-deep-dive) - Building Scio from scratch step by step for an internal training session - - [Klio](https://github.com/spotify/klio) - Large scale audio or binary file processing with Python and Apache Beam - - [scala-flow](https://github.com/zendesk/scala-flow) - A lightweight Scala wrapper for Google Cloud Dataflow from Zendesk - - [clj-headlights](https://github.com/zendesk/clj-headlights) - Clojure API for Apache Beam, also from Zendesk - - [datasplash](https://github.com/ngrunwald/datasplash) - A Clojure API for Google Cloud Dataflow +- [Featran](https://github.com/spotify/featran) - A Scala feature transformation library for data science and machine learning +- [Big Data Rosetta Code](https://github.com/spotify/big-data-rosetta-code) - Code snippets for solving common big data problems in various platforms. 
Inspired by [Rosetta Code](https://rosettacode.org/) +- [Ratatool](https://github.com/spotify/ratatool) - A tool for random data sampling and generation, which includes [BigDiffy](https://github.com/spotify/ratatool/blob/master/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala), a Scio library for pairwise field-level statistical diff of data sets ([slides](http://www.lyh.me/slides/bigdiffy.html)) +- [Elitzur](https://github.com/spotify/elitzur) - Data validation for Scala and Scio +- [Scio Koans](https://github.com/nevillelyh/scio-koans/) - A collection of Scio exercises inspired by [Ruby Koans](http://rubykoans.com/) and many others. +- [scio-deep-dive](https://github.com/nevillelyh/scio-deep-dive) - Building Scio from scratch step by step for an internal training session +- [Klio](https://github.com/spotify/klio) - Large scale audio or binary file processing with Python and Apache Beam +- [scala-flow](https://github.com/zendesk/scala-flow) - A lightweight Scala wrapper for Google Cloud Dataflow from Zendesk +- [clj-headlights](https://github.com/zendesk/clj-headlights) - Clojure API for Apache Beam, also from Zendesk +- [datasplash](https://github.com/ngrunwald/datasplash) - A Clojure API for Google Cloud Dataflow @@@ index * @ref:[Getting Started](Getting-Started.md) -* @ref:[Examples](examples.md) +* @ref:[Built-in Functionality](Builtin.md) +* @ref:[Joins](Joins.md) +* @ref:[SideInputs](SideInputs.md) * @ref:[IO](io/index.md) +* @ref:[Examples](examples.md) * @ref:[Testing](Scio-Unit-Tests.md) -* @ref:[REPL](Scio-REPL.md) * @ref:[Internals](internals/index.md) * @ref:[Extras](extras/index.md) -* @ref:[Migration guides](migrations/index.md) * @ref:[Development](dev/index.md) * @ref:[Scaladoc](scaladoc.md) -* @ref:[Scio, Beam and Dataflow](Scio,-Beam-and-Dataflow.md) * @ref:[Scio, Scalding and Spark](Scio,-Scalding-and-Spark.md) * @ref:[Runners](Runners.md) * @ref:[Data guideline](Scio-data-guideline.md) -* @ref:[Beam Compatibility Guide](Apache-Beam.md) * @ref:[Releases](releases/index.md) * @ref:[FAQ](FAQ.md) -* @ref:[Powered By](Powered-By.md) @@@ diff --git a/site/src/main/paradox/internals/Coders.md b/site/src/main/paradox/internals/Coders.md index 4a88429ead..65c450c964 100644 --- a/site/src/main/paradox/internals/Coders.md +++ b/site/src/main/paradox/internals/Coders.md @@ -7,7 +7,7 @@ As per [Beam's documentation](https://beam.apache.org/documentation/programming- > When Beam runners execute your pipeline, they often need to materialize the intermediate data in your PCollections, which requires converting elements to and from byte strings. The Beam SDKs use objects called Coders to describe how the elements of a given PCollection may be encoded and decoded. For the most part, coders are used when Beam transfer intermediate data between workers over the network. They may also be used by beam to test instances for equality. -Anytime you create a `SCollection[T]`, Beam needs to know how to go from an instance of `T` to an array of bytes, and from that array of bytes to an instance of `T`. +Anytime you create an `SCollection[T]`, Beam needs to know how to go from an instance of `T` to an array of bytes, and from that array of bytes to an instance of `T`. The Beam SDK defines a class called `Coder` that roughly looks like this: @@ -18,7 +18,7 @@ public abstract class Coder implements Serializable { } ``` -Beam provides built-in Coders for various basic Java types (`Integer`, `Long`, `Double`, etc.). 
But anytime you create a new class, and that class is used in a `SCollection`, a beam coder needs to be provided. +Beam provides built-in Coders for various basic Java types (`Integer`, `Long`, `Double`, etc.). But anytime you create a new class, and that class is used in an `SCollection`, a beam coder needs to be provided. ```scala mdoc:silent import com.spotify.scio.values.SCollection @@ -105,10 +105,10 @@ Scio `Coder` and its implementations simply form an [ADT](https://en.wikipedia.o There is also a "special" coder called `KVCoder`. It is a specific coder for Key-Value pairs. Internally Beam treats @javadoc[KV](org.apache.beam.sdk.values.KV) differently from other types so Scio needs to do the same. -It is important to note that **Scio's coders are only representations** of those cases but **do not actually implement any serialization logic**. Before the job starts, those coders will be *materialized*, meaning they will be converted to instances of @javadoc[`org.apache.beam.sdk.coders.Coder`](org.apache.beam.sdk.coders.Coder). +It is important to note that **Scio's coders are only representations** of those cases but **do not actually implement any serialization logic**. Before the job starts, those coders will be *materialized*, meaning they will be converted to instances of @javadoc[org.apache.beam.sdk.coders.Coder](org.apache.beam.sdk.coders.Coder). Thanks to this technique, Scio can dynamically change the behavior of coders depending on the execution context. For example coders may handle nullable values differently depending on options passed to the job. -@javadoc[`org.apache.beam.sdk.coders.Coder`](org.apache.beam.sdk.coders.Coder) instances on the other hand are the actual implementations of serialization and deserialization logic. Among other thing, each instance of `org.apache.beam.sdk.coders.Coder[T]` defines two methods: +@javadoc[org.apache.beam.sdk.coders.Coder](org.apache.beam.sdk.coders.Coder) instances on the other hand are the actual implementations of serialization and deserialization logic. Among other thing, each instance of `org.apache.beam.sdk.coders.Coder[T]` defines two methods: ```scala class ExampleCoder extends org.apache.beam.sdk.coders.Coder[Example] { @@ -256,7 +256,7 @@ scalacOptions += "-Xmacro-settings:show-coder-fallback=true" ## How to build a custom Coder -It is possible for the user to define their own `Coder` implementation. Scio provides [builder functions](https://spotify.github.io/scio/api/com/spotify/scio/coders/CoderGrammar.html) in the `Coder` object. If you want to create a custom `Coder`, you should use one of the those three builder: +It is possible for the user to define their own `Coder` implementation. Scio provides @scaladoc[builder functions](com.spotify.scio.coders.CoderGrammar) in the `Coder` object. If you want to create a custom `Coder`, you should use one of the those three builder: - **`Coder.beam`**: Create a Scio `Coder` that simply wraps a Beam implementation. For example: ```scala mdoc @@ -288,11 +288,11 @@ Note that in test mode (when you use `JobTest`), Scio will make sure that all th ### Testing custom coders -Scio provides a few assertions specific to coders. See [CoderAssertions](https://spotify.github.io/scio/api/com/spotify/scio/testing/CoderAssertions$.html). +Scio provides a few assertions specific to coders. See @scaladoc[CoderAssertions](com.spotify.scio.testing.CoderAssertions$). ## Null values support -By default and for performance reasons, Scio coders will expect the values to serialized to never be `null`. 
+By default, and for performance reasons, Scio coders will expect the values to serialized to never be `null`. This may cause the following exception to be thrown: @@ -316,4 +316,4 @@ There are 2 ways to fix this issue: ## Upgrading to `v0.7.0` or above: Migrating to static coder Migrating to Scio `0.7.x` from an older version is likely to break a few things at compile time in your project. -See the complete @ref:[v0.7.0 Migration Guide](../migrations/v0.7.0-Migration-Guide.md) for more information. +See the complete @ref:[v0.7.0 Migration Guide](../releases/migrations/v0.7.0-Migration-Guide.md) for more information. diff --git a/site/src/main/paradox/internals/OverrideTypeProvider.md b/site/src/main/paradox/internals/OverrideTypeProvider.md index 6e4fc0edd9..1727674316 100644 --- a/site/src/main/paradox/internals/OverrideTypeProvider.md +++ b/site/src/main/paradox/internals/OverrideTypeProvider.md @@ -1,8 +1,9 @@ # OverrideTypeProvider -The @scaladoc[`OverrideTypeProvider`](com.spotify.scio.bigquery.validation.OverrideTypeProvider) trait allows the user to provide custom mappings from BigQuery types to custom Scala types. +The @scaladoc[OverrideTypeProvider](com.spotify.scio.bigquery.validation.OverrideTypeProvider) trait allows the user to provide custom mappings from BigQuery types to custom Scala types. This can be used for a number of use cases: + * Using higher level types in Scio in order to be explicit about what your data is * Custom code can be run when you create new objects to do things like data validation or simple transformation diff --git a/site/src/main/paradox/internals/ScioIO.md b/site/src/main/paradox/internals/ScioIO.md index f1ca9a3f87..f373ddf223 100644 --- a/site/src/main/paradox/internals/ScioIO.md +++ b/site/src/main/paradox/internals/ScioIO.md @@ -1,6 +1,6 @@ # ScioIO -Scio `0.7.0` introduces a new @scaladoc[`ScioIO[T]`](com.spotify.scio.io.ScioIO) trait to simplify IO implementation and stubbing in `JobTest`. This page lists some major changes to this new API. +Scio `0.7.0` introduces a new @scaladoc[ScioIO[T]](com.spotify.scio.io.ScioIO) trait to simplify IO implementation and stubbing in `JobTest`. This page lists some major changes to this new API. ## Dependencies diff --git a/site/src/main/paradox/io/Avro.md b/site/src/main/paradox/io/Avro.md index acf1c18f72..4cdcd225de 100644 --- a/site/src/main/paradox/io/Avro.md +++ b/site/src/main/paradox/io/Avro.md @@ -6,13 +6,13 @@ Scio comes with support for reading Avro files. Avro supports generic or specifi ### Read Specific records -```scala mdoc:reset:silent +```scala mdoc:compile-only import com.spotify.scio.ScioContext import com.spotify.scio.avro._ import org.apache.avro.specific.SpecificRecord -def sc: ScioContext = ??? +val sc: ScioContext = ??? // SpecificRecordClass is compiled from Avro schema files def result = sc.avroFile[SpecificRecord]("gs://path-to-data/lake/part-*.avro") @@ -20,7 +20,7 @@ def result = sc.avroFile[SpecificRecord]("gs://path-to-data/lake/part-*.avro") ### Read Generic records -```scala mdoc:reset:silent +```scala mdoc:compile-only import com.spotify.scio.ScioContext import com.spotify.scio.avro._ @@ -29,7 +29,7 @@ import org.apache.avro.Schema def yourAvroSchema: Schema = ??? -def sc: ScioContext = ??? +val sc: ScioContext = ??? def result = sc.avroFile("gs://path-to-data/lake/part-*.avro", yourAvroSchema) // `record` is of GenericRecord type @@ -41,14 +41,14 @@ Scio comes with support for writing Avro files. 
Avro supports generic or specifi ### Write Specific records -```scala mdoc:reset:silent +```scala mdoc:compile-only import com.spotify.scio.values.SCollection import com.spotify.scio.avro._ import org.apache.avro.specific.SpecificRecord case class Foo(x: Int, s: String) -def sc: SCollection[Foo] = ??? +val sc: SCollection[Foo] = ??? // convert to avro SpecificRecord def fn(f: Foo): SpecificRecord = ??? @@ -61,7 +61,7 @@ def result = sc.map(fn).saveAsAvroFile("gs://path-to-data/lake/output") ### Write Generic records -```scala mdoc:reset:silent +```scala mdoc:compile-only import com.spotify.scio.values.SCollection import com.spotify.scio.avro._ @@ -69,7 +69,7 @@ import org.apache.avro.generic.GenericRecord import org.apache.avro.Schema case class Foo(x: Int, s: String) -def sc: SCollection[Foo] = ??? +val sc: SCollection[Foo] = ??? def yourAvroSchema: Schema = ??? @@ -85,8 +85,8 @@ def result = sc.map(fn).saveAsAvroFile("gs://path-to-data/lake/output", schema * Unless impossible, provide default values for your fields. * New field must have a default value. * You can only delete field which has default value. -* Do not change data type of an existing fields. If needed add a new field to the schema. -* Do not rename existing fields. If needed use aliases. +* Do not change the data type of existing fields. If needed, add a new field to the schema. +* Do not rename existing fields. If needed, use aliases. ## Common issues/guidelines diff --git a/site/src/main/paradox/io/BigQuery.md b/site/src/main/paradox/io/BigQuery.md index 44bbce57dd..7bf3b718cf 100644 --- a/site/src/main/paradox/io/BigQuery.md +++ b/site/src/main/paradox/io/BigQuery.md @@ -6,16 +6,16 @@ ### TableRow -BigQuery rows are represented as [`TableRow`](https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/com/google/api/services/bigquery/model/TableRow.html) in the BigQuery Java API which is basically a `Map`. Fields are accessed by name strings and values must be cast or converted to the desired type, both of which are error prone process. +BigQuery rows are represented as @javadoc[TableRow](com.google.api.services.bigquery.model.TableRow) in the BigQuery Java API which is basically a `Map`. Fields are accessed by name strings and values must be cast or converted to the desired type, both of which are error prone process. ### Type safe BigQuery -The type safe BigQuery API in Scio represents rows as case classes and generates [`TableSchema`](https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/com/google/api/services/bigquery/model/TableSchema.html) converters automatically at compile time with the following mapping logic: +The type safe BigQuery API in Scio represents rows as case classes and generates @javadoc[TableSchema](com.google.api.services.bigquery.model.TableSchema) converters automatically at compile time with the following mapping logic: - Nullable fields are mapped to `Option[T]`s - Repeated fields are mapped to `List[T]`s - Records are mapped to nested case classes -- Timestamps are mapped to Joda Time [`Instant`](http://www.joda.org/joda-time/apidocs/org/joda/time/class-use/Instant.html) +- Timestamps are mapped to Joda Time @javadoc[Instant](org.joda.time.Instant) See documentation for @scaladoc[BigQueryType](com.spotify.scio.bigquery.types.BigQueryType$) for the complete list of supported types. 
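+
+For example, a minimal sketch of this mapping using the `@BigQueryType` annotations (the table and field names below are hypothetical and used only for illustration):
+
+```scala
+import com.spotify.scio.bigquery._
+
+// Hypothetical table reference, for illustration only
+@BigQueryType.fromTable("my-project:my_dataset.my_table")
+class Row
+
+// Nullable fields become Option[T], repeated fields become List[T],
+// and the TableSchema converter is generated at compile time
+@BigQueryType.toTable
+case class Result(word: String, count: Long, tag: Option[String], synonyms: List[String])
+```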
diff --git a/site/src/main/paradox/io/Binary.md b/site/src/main/paradox/io/Binary.md new file mode 100644 index 0000000000..bdf3e0b4fd --- /dev/null +++ b/site/src/main/paradox/io/Binary.md @@ -0,0 +1,34 @@ +# Binary + +## Read Binary files + +See @ref:[read as binary](ReadFiles.md#read-as-binary) for reading an entire file as a binary record. + +## Write Binary files + +Binary writes are supported on `SCollection[Array[Byte]]` with the @scaladoc[saveAsBinaryFile](com.spotify.scio.values.SCollection#saveAsBinaryFile(path:String,numShards:Int,prefix:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,header:Array[Byte],footer:Array[Byte],shardNameTemplate:String,framePrefix:Array[Byte]=%3EArray[Byte],frameSuffix:Array[Byte]=%3EArray[Byte],tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier)(implicitev:T%3C:%3CArray[Byte]):com.spotify.scio.io.ClosedTap[Nothing]) method: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection + +val byteArrays: SCollection[Array[Byte]] = ??? +byteArrays.saveAsBinaryFile("gs://") +``` + +A static `header` and `footer` argument are provided, along with the framing parameters `framePrefix` and `frameSuffix`: + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import java.nio.ByteBuffer + +val byteArrays: SCollection[Array[Byte]] = ??? +byteArrays.saveAsBinaryFile( + "gs://", + header = Array(1, 2, 3), + footer = Array(4, 5, 6), + framePrefix = arr => ByteBuffer.allocate(4).putInt(arr.length).array(), + frameSuffix = _ => Array(0) +) +``` + +See also the @ref:[object file format](Object.md), which saves binary data in an avro container. diff --git a/site/src/main/paradox/io/Cassandra.md b/site/src/main/paradox/io/Cassandra.md new file mode 100644 index 0000000000..77858b471d --- /dev/null +++ b/site/src/main/paradox/io/Cassandra.md @@ -0,0 +1,20 @@ +# Cassandra + +Scio supports writing to [Cassandra](https://cassandra.apache.org/) + +@scaladoc[saveAsCassandra](com.spotify.scio.cassandra.CassandraSCollection#saveAsCassandra(opts:com.spotify.scio.cassandra.CassandraOptions,parallelism:Int)(f:T=%3ESeq[Any]):com.spotify.scio.io.ClosedTap[Nothing]) performs bulk writes, grouping by the table partition key before writing to the cluster. + +The bulk writer writes to all nodes in a cluster so remote nodes in a multi-datacenter cluster may become a bottleneck. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.cassandra._ + +val host: String = ??? + +val cql = "INSERT INTO myKeyspace.myTable (key, value1) VALUES (?, ?)" +val opts = CassandraOptions("myKeyspace", "myTable", cql, host) +val elements: SCollection[(String, Int)] = ??? 
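+// The curried argument maps each element to the bind values of the CQL statement, in order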
+elements.saveAsCassandra(opts) { case (key, value) => Seq(key, value) } +``` diff --git a/site/src/main/paradox/io/Csv.md b/site/src/main/paradox/io/Csv.md new file mode 100644 index 0000000000..6d37c52419 --- /dev/null +++ b/site/src/main/paradox/io/Csv.md @@ -0,0 +1,99 @@ +# CSV + +Scio supports reading and writing _typed_ CSV via [kantan](https://nrinaudo.github.io/kantan.csv/) + +Kantan provides a @scaladoc[CsvConfiguration](kantan.csv.CsvConfiguration) that allows users to configure the CSV handling, Scio's default config: + +```scala +import kantan.csv._ +import kantan.csv.CsvConfiguration.{Header, QuotePolicy} + +CsvConfiguration( + cellSeparator = ',', + quote = '"', + quotePolicy = QuotePolicy.WhenNeeded, + header = Header.Implicit +) +``` + +## Read CSV + +FIXME this csvFile link is incorrectly getting two $$ +Reading CSV is supported via @scaladoc[csvFile](com.spotify.scio.extra.csv.syntax.ScioContextSyntax.CsvScioContext#csvFile[T](path:String,params:com.spotify.scio.extra.csv.CsvIO.ReadParam)(implicitevidence$1:kantan.csv.HeaderDecoder[T],implicitevidence$2:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T]). +Note that the entire file must be read into memory since CSVs are not trivially splittable. + +### Read with a header + +For CSV files with a header, reading requires an implicit @scaladoc[HeaderDecoder](kantan.csv.HeaderDecoder) for your type. + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.csv._ +import kantan.csv._ + +case class A(i: Int, s: String) +implicit val decoder: HeaderDecoder[A] = HeaderDecoder.decoder("col1", "col2")(A.apply) + +val sc: ScioContext = ??? +val elements: SCollection[A] = sc.csvFile("gs:///*.csv") +``` + +### Read without a header + +For CSV files without a header, an implicit @scaladoc[RowDecoder](kantan.csv.RowDecoder) must be in scope and the read must be provided with a config specifying that there is no header: + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.csv._ +import kantan.csv._ + +case class A(i: Int, s: String) + +implicit val decoder: RowDecoder[A] = RowDecoder.ordered { (col1: Int, col2: String) => A(col1, col2) } +val config = CsvIO.DefaultCsvConfiguration.withoutHeader + +val sc: ScioContext = ??? +val elements: SCollection[A] = sc.csvFile("gs:///*.csv", CsvIO.ReadParam(csvConfiguration = config)) +``` + +## Write CSV + +Writing to CSV is supported via @scaladoc[saveAsCsvFile](com.spotify.scio.extra.csv.syntax.SCollectionSyntax.WritableCsvSCollection#saveAsCsvFile(path:String,suffix:String,csvConfig:kantan.csv.CsvConfiguration,numShards:Int,compression:org.apache.beam.sdk.io.Compression,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier)(implicitcoder:com.spotify.scio.coders.Coder[T],implicitenc:kantan.csv.HeaderEncoder[T]):com.spotify.scio.io.ClosedTap[Nothing]). + +### Write with a header + +Writing with a header requires an implicit @scaladoc[HeaderEncoder](kantan.csv.HeaderEncoder) to be in scope: + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.csv._ +import kantan.csv._ + +case class A(i: Int, s: String) + +implicit val encoder: HeaderEncoder[A] = HeaderEncoder.caseEncoder("col1", "col2")(A.unapply) + +val elements: SCollection[A] = ??? 
+elements.saveAsCsvFile("gs:///") +``` + +### Write without a header + +Writing without a header requires an implicit @scaladoc[RowEncoder](kantan.csv.RowEncoder) to be in scope: + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.csv._ +import kantan.csv._ + +case class A(i: Int, s: String) + +implicit val encoder: RowEncoder[A] = RowEncoder.encoder(0, 1)((a: A) => (a.i, a.s)) + +val elements: SCollection[A] = ??? +elements.saveAsCsvFile("gs:///") +``` \ No newline at end of file diff --git a/site/src/main/paradox/io/Datastore.md b/site/src/main/paradox/io/Datastore.md new file mode 100644 index 0000000000..09c4bf0087 --- /dev/null +++ b/site/src/main/paradox/io/Datastore.md @@ -0,0 +1,39 @@ +# Datastore + +Scio supports [Google Datastore](https://cloud.google.com/datastore) via Beam's @javadoc[DatastoreIO](org.apache.beam.sdk.io.gcp.datastore.DatastoreIO). + +[Magnolify's](https://github.com/spotify/magnolify) `EntityType` (available as part of the `magnolify-datastore` artifact) provides automatically-derived mappings between Datastore's `Entity` and scala case classes. See [full documentation here](https://github.com/spotify/magnolify/blob/main/docs/datastore.md) and [an example usage here](https://spotify.github.io/scio/examples/MagnolifyDatastoreExample.scala.html). + +## Reads + +Read an `SCollection` of `com.google.datastore.v1.Entity` from Datastore with @scaladoc[datastore](com.spotify.scio.datastore.syntax.ScioContextOps#datastore(projectId:String,query:com.google.datastore.v1.Query,namespace:String):com.spotify.scio.values.SCollection[com.google.datastore.v1.Entity]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.datastore._ +import com.google.datastore.v1.{Entity, Query} + +val sc: ScioContext = ??? + +val projectId: String = ??? +val query: Query = Query.getDefaultInstance +val entities: SCollection[Entity] = sc.datastore(projectId, query) +``` + +## Writes + +Write a collection of + +@scaladoc[saveAsDatastore](com.spotify.scio.datastore.syntax.SCollectionEntityOps#saveAsDatastore(projectId:String):com.spotify.scio.io.ClosedTap[Nothing]) + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.datastore._ +import com.google.datastore.v1.{Entity, Query} + +val projectId: String = ??? +val entities: SCollection[Entity] = ??? +entities.saveAsDatastore(projectId) +``` diff --git a/site/src/main/paradox/io/Elasticsearch.md b/site/src/main/paradox/io/Elasticsearch.md new file mode 100644 index 0000000000..ff721706bb --- /dev/null +++ b/site/src/main/paradox/io/Elasticsearch.md @@ -0,0 +1,56 @@ +# Elasticsearch + +Scio supports writing to [Elasticsearch](https://github.com/elastic/elasticsearch). + +## Writes + +An `SCollection` of arbitrary elements can be saved to Elasticsearch with +@scaladoc[saveAsElasticsearch](com.spotify.scio.elasticsearch.ElasticsearchSCollection#saveAsElasticsearch(esOptions:com.spotify.scio.elasticsearch.ElasticsearchOptions,flushInterval:org.joda.time.Duration,numOfShards:Long,maxBulkRequestOperations:Int,maxBulkRequestBytes:Long,errorFn:org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write.BulkExecutionException=%3EUnit,retry:com.spotify.scio.elasticsearch.ElasticsearchIO.RetryConfig)(f:T=%3EIterable[co.elastic.clients.elasticsearch.core.bulk.BulkOperation]):com.spotify.scio.io.ClosedTap[Nothing]). 
+The @scaladoc[ElasticsearchOptions](com.spotify.scio.elasticsearch.ElasticsearchOptions)-typed `esOptions` argument requires a `mapperFactory` argument capable of mapping the element type to json. +`saveAsElasticsearch` takes a second argument list, whose single argument `f` can be provided as a block, and which maps the input type to Elasticsearch [BulkOperations](https://artifacts.elastic.co/javadoc/co/elastic/clients/elasticsearch-java/8.8.0/co/elastic/clients/elasticsearch/core/bulk/BulkOperation.html). + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.elasticsearch._ + +import co.elastic.clients.elasticsearch.core.bulk.{BulkOperation, IndexOperation} +import co.elastic.clients.json.jackson.JacksonJsonpMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule +import org.apache.http.HttpHost +import java.time.LocalDate + +val host: String = ??? +val port: Int = ??? +val esIndex: String = ??? + +case class Document(user: String, postDate: LocalDate, word: String, count: Long) + +val primaryESHost = new HttpHost(host, port) +val mapperFactory = () => { + val mapper = new JacksonJsonpMapper() + mapper.objectMapper().registerModule(DefaultScalaModule) + mapper.objectMapper().registerModule(new JavaTimeModule()) + mapper +} +val esOptions = ElasticsearchOptions( + nodes = Seq(primaryESHost), + mapperFactory = mapperFactory +) + +val elements: SCollection[Document] = ??? +elements.saveAsElasticsearch(esOptions) { d => + List( + BulkOperation.of { bulkBuilder => + bulkBuilder.index( + IndexOperation.of[Document] { indexBuilder => + indexBuilder + .index(esIndex) + .document(d) + } + ) + } + ) +} +``` diff --git a/site/src/main/paradox/io/Grpc.md b/site/src/main/paradox/io/Grpc.md new file mode 100644 index 0000000000..7b1822f5d8 --- /dev/null +++ b/site/src/main/paradox/io/Grpc.md @@ -0,0 +1,23 @@ +# GRPC + +Scio supports lookups via [GRPC](https://grpc.io/) in the `scio-grpc` artifact. 
+ +Given an `SCollection` of GRPC request objects (`ConcatRequest` below), @scaladoc[grpcLookup](com.spotify.scio.grpc.GrpcSCollectionOps#grpcLookup[Response,Client%3C:io.grpc.stub.AbstractFutureStub[Client]](channelSupplier:()=%3Eio.grpc.Channel,clientFactory:io.grpc.Channel=%3EClient,maxPendingRequests:Int,cacheSupplier:com.spotify.scio.transforms.BaseAsyncLookupDoFn.CacheSupplier[Request,Response])(f:Client=%3E(Request=%3Ecom.google.common.util.concurrent.ListenableFuture[Response]))(implicitevidence$1:com.spotify.scio.coders.Coder[Response]):com.spotify.scio.values.SCollection[(Request,scala.util.Try[Response])]) (or @scaladoc[grpcLookupStream](com.spotify.scio.grpc.GrpcSCollectionOps#grpcLookupStream[Response,Client%3C:io.grpc.stub.AbstractStub[Client]](channelSupplier:()=%3Eio.grpc.Channel,clientFactory:io.grpc.Channel=%3EClient,maxPendingRequests:Int,cacheSupplier:com.spotify.scio.transforms.BaseAsyncLookupDoFn.CacheSupplier[Request,Iterable[Response]])(f:Client=%3E((Request,io.grpc.stub.StreamObserver[Response])=%3EUnit))(implicitevidence$2:com.spotify.scio.coders.Coder[Response]):com.spotify.scio.values.SCollection[(Request,scala.util.Try[Iterable[Response]])]) for iterable responses) provides a concise syntax for handling responses: + +```scala +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.grpc._ +import com.spotify.concat.v1._ +import io.grpc.netty.NettyChannelBuilder + +val ServiceUri: String = s"dns:///localhost:50051" +val maxPendingRequests = 10 +val requests: SCollection[ConcatRequest] = ??? +requests + .grpcLookup[ConcatResponse, ConcatServiceGrpc.ConcatServiceFutureStub]( + () => NettyChannelBuilder.forTarget(ServiceUri).usePlaintext().build(), + ConcatServiceGrpc.newFutureStub, + maxPendingRequests + )(_.concat) +``` diff --git a/site/src/main/paradox/io/Jdbc.md b/site/src/main/paradox/io/Jdbc.md new file mode 100644 index 0000000000..58186250dc --- /dev/null +++ b/site/src/main/paradox/io/Jdbc.md @@ -0,0 +1,91 @@ +# JDBC + +Scio supports JDBC reads and writes. + +## Reads + +Reads come in two flavors: a query-based variant backed by Beam's @javadoc[JdbcIO](org.apache.beam.sdk.io.jdbc.JdbcIO) and a "sharded select" that performs a parallelizable bulk read on an entire table. + +### Read via query + +Query-based reads are supported with @scaladoc[jdbcSelect](com.spotify.scio.jdbc.syntax.JdbcScioContextOps#jdbcSelect[T](connectionOptions:com.spotify.scio.jdbc.JdbcConnectionOptions,query:String,statementPreparator:java.sql.PreparedStatement=%3EUnit,fetchSize:Int,outputParallelization:Boolean,dataSourceProviderFn:()=%3E,javax.sql.DataSource,configOverride:org.apache.beam.sdk.io.jdbc.JdbcIO.Read[T]=%3Eorg.apache.beam.sdk.io.jdbc.JdbcIO.Read[T])(rowMapper:java.sql.ResultSet=%3ET)(implicitevidence$3:scala.reflect.ClassTag[T],implicitevidence$4:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T]). +It expects a @scaladoc[JdbcConnectionOptions](com.spotify.scio.jdbc.JdbcConnectionOptions) to connect to the database. +The `statementPreparator` argument may be used to set static parameters in the query, usually passed as arguments to the pipeline. +The curried `rowMapper` function argument maps between a `java.sql.ResultSet` to the result type `T`. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.jdbc._ +import java.sql.Driver + +val (sc, args): (ScioContext, Args) = ??? 
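+// "wordCountSourceArg" is a pipeline argument; it is bound to the query's `?` placeholder below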
+val sourceArg: String = args("wordCountSourceArg") + +val jdbcUrl: String = ??? +val driverClass: Class[Driver] = ??? +val jdbcOptions = JdbcConnectionOptions("username", Some("password"), jdbcUrl, driverClass) +val query = "SELECT word, word_count FROM word_count WHERE source = ?" + +val elements: SCollection[(String, Long)] = sc.jdbcSelect(jdbcOptions, query, _.setString(1, sourceArg)) { r => + r.getString(1) -> r.getLong(2) +} +``` + +### Parallelized table read + +When an entire table is to be read, the input table can be sharded based on some column value and each shard read in parallel with @scaladoc[jdbcShardedSelect](com.spotify.scio.jdbc.syntax.JdbcScioContextOps#jdbcShardedSelect[T,S](readOptions:com.spotify.scio.jdbc.sharded.JdbcShardedReadOptions[T,S])(implicitevidence$3:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T]). + +@scaladoc[JdbcShardedReadOptions](com.spotify.scio.jdbc.sharded.JdbcShardedReadOptions) requires: + +* A `rowMapper` with the same function as in `jdbcSelect` +* The `tableName` of the table to be read +* A `shardColumn`, the column on which the read will be sharded. This column must be indexed and should have an index where `shardColumn` is not part of a composite index. +* A `shard` (@scaladoc[Shard](com.spotify.scio.jdbc.sharded.Shard$)) implementation for the type of `shardColumn`. Provided implementations are `Int`, `Long`, `BigDecimal`, `Double`, `Float`, @scaladoc[ShardString.HexUpperString](com.spotify.scio.jdbc.sharded.ShardString.HexUpperString), @scaladoc[ShardString.HexLowerString](com.spotify.scio.jdbc.sharded.ShardString.HexLowerString), @scaladoc[ShardString.UuidUpperString](com.spotify.scio.jdbc.sharded.ShardString.UuidUpperString), @scaladoc[ShardString.UuidLowerString](com.spotify.scio.jdbc.sharded.ShardString.UuidLowerString), and @scaladoc[ShardString.Base64String](com.spotify.scio.jdbc.sharded.ShardString.Base64String). + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.jdbc._ +import java.sql.Driver +import com.spotify.scio.jdbc.sharded._ + +val sc: ScioContext = ??? + +val jdbcUrl: String = ??? +val driverClass: Class[Driver] = ??? +val connOpts = JdbcConnectionOptions("username", Some("password"), jdbcUrl, driverClass) + +val shardedReadOptions = JdbcShardedReadOptions[(String, Long), Long]( + connectionOptions = connOpts, + tableName = "tableName", + shardColumn = "word_count", + shard = Shard.range[Long], + rowMapper = r => (r.getString("word"), r.getLong("word_count")) +) +val elements: SCollection[(String, Long)] = sc.jdbcShardedSelect(shardedReadOptions) +``` + +## Writes + +Write to JDBC with @scaladoc[saveAsJdbc](com.spotify.scio.jdbc.syntax.JdbcSCollectionOps#saveAsJdbc(connectionOptions:com.spotify.scio.jdbc.JdbcConnectionOptions,statement:String,batchSize:Long,retryConfiguration:org.apache.beam.sdk.io.jdbc.JdbcIO.RetryConfiguration,retryStrategy:java.sql.SQLException=%3EBoolean,autoSharding:Boolean,dataSourceProviderFn:()=%3Ejavax.sql.DataSource,configOverride:org.apache.beam.sdk.io.jdbc.JdbcIO.Write[T]=%3Eorg.apache.beam.sdk.io.jdbc.JdbcIO.Write[T])(preparedStatementSetter:(T,java.sql.PreparedStatement)=%3EUnit):com.spotify.scio.io.ClosedTap[Nothing]). +It expects a @scaladoc[JdbcConnectionOptions](com.spotify.scio.jdbc.JdbcConnectionOptions) to connect to the database. 
+The curried `preparedStatementSetter` function argument receives an instance of the type-to-be-written and a `PreparedStatement` and appropriately sets the statement fields. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.jdbc._ +import java.sql.Driver + +val jdbcUrl: String = ??? +val driverClass: Class[Driver] = ??? +val jdbcOptions = JdbcConnectionOptions("username", Some("password"), jdbcUrl, driverClass) +val statement = "INSERT INTO word_count (word, count) values (?, ?)" + +val elements: SCollection[(String, Long)] = ??? +elements.saveAsJdbc(jdbcOptions, statement) { case ((word, count), statement) => + statement.setString(1, word) + statement.setLong(2, count) +} +``` diff --git a/site/src/main/paradox/io/Json.md b/site/src/main/paradox/io/Json.md new file mode 100644 index 0000000000..0357df343a --- /dev/null +++ b/site/src/main/paradox/io/Json.md @@ -0,0 +1,36 @@ +# Json + +Scio supports reading and writing type-safe Json to a case class via [circe](https://circe.github.io/circe/). +Scio must be able to derive @scaladoc[Encoder](com.spotify.scio.extra.json#Encoder[T]=io.circe.Encoder[T]) and @scaladoc[Decoder](com.spotify.scio.extra.json#Decoder[T]=io.circe.Decoder[T]) instances for the record type. + +If you need support for custom encoders or decoders, see the [circe documentation](https://circe.github.io/circe/codecs/custom-codecs.html) + +## Reading Json + +Read Json into a record type with @scaladoc[jsonFile](com.spotify.scio.extra.json.JsonScioContext#jsonFile[T](path:String,compression:org.apache.beam.sdk.io.Compression)(implicitevidence$1:com.spotify.scio.extra.json.package.Decoder[T],implicitevidence$2:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.extra.json._ + +case class Record(i: Int, d: Double, s: String) + +val sc: ScioContext = ??? +val records: SCollection[Record] = sc.jsonFile[Record]("input.json") +``` + +## Writing Json + +Write to Json with @scaladoc[saveAsJsonFile](com.spotify.scio.extra.json.JsonSCollection#saveAsJsonFile(path:String,suffix:String,numShards:Int,compression:org.apache.beam.sdk.io.Compression,printer:io.circe.Printer,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[T]), which optionally takes a custom `printer` argument of type [`io.circe.Printer`](https://circe.github.io/circe/api/io/circe/Printer.html) for controlling formatting. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.extra.json._ +import com.spotify.scio.values.SCollection + +case class Record(i: Int, d: Double, s: String) + +val elements: SCollection[Record] = ??? +elements.saveAsJsonFile("gs://") +``` \ No newline at end of file diff --git a/site/src/main/paradox/io/Neo4J.md b/site/src/main/paradox/io/Neo4J.md new file mode 100644 index 0000000000..8a932965fc --- /dev/null +++ b/site/src/main/paradox/io/Neo4J.md @@ -0,0 +1,83 @@ +# Neo4J + +Scio provides support [Neo4J](https://neo4j.com/) in the `scio-neo4j` artifact. + +Scio uses [magnolify's](https://github.com/spotify/magnolify) `magnolify-neo4j` to convert to and from Neo4J types. 
+ +## Static query + +@scaladoc[neo4jCypher](com.spotify.scio.neo4j.syntax.Neo4jScioContextOps#neo4jCypher[T](neo4jOptions:com.spotify.scio.neo4j.Neo4jOptions,cypher:String)(implicitevidence$1:magnolify.neo4j.ValueType[T],implicitevidence$2:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T]) returns an `SCollection` of results for a Neo4J cypher query, mapped to a specified case class type. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.neo4j._ + +case class Entity(id: String, property: Option[String]) + +val sc: ScioContext = ??? +val opts = Neo4jOptions(Neo4jConnectionOptions("neo4j://neo4j.com:7687", "username", "password")) +val query = + """MATCH (e:Entity) + |WHERE e.property = 'value' + |RETURN e""".stripMargin +val entities: SCollection[Entity] = sc + .neo4jCypher[Entity](opts, query) +``` + +## Parameterized query + +@scaladoc[neo4jCypher](com.spotify.scio.neo4j.syntax.Neo4jSCollectionOps#neo4jCypher[U](neo4jConf:com.spotify.scio.neo4j.Neo4jOptions,cypher:String)(implicitneo4jInType:magnolify.neo4j.ValueType[T],implicitneo4jOutType:magnolify.neo4j.ValueType[U],implicitcoder:com.spotify.scio.coders.Coder[U]):com.spotify.scio.values.SCollection[U]) can also construct queries from parameters in an existing `SCollection`: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.neo4j._ + +case class MovieParam(year: Int) +case class Person(name: String) +case class Movie(title: String, year: Int) +case class Role(person: Person, movie: Movie, role: String) + +val sc: ScioContext = ??? +val input: SCollection[MovieParam] = sc.parallelize( + List( + MovieParam(1994), + MovieParam(0), + MovieParam(1995) + ) +) + +val opts = Neo4jOptions(Neo4jConnectionOptions("neo4j://neo4j.com:7687", "username", "password")) + +val queryRoles = + """MATCH (p)-[r: ACTED_IN]->(m) + |WHERE m.year = $year + |RETURN p as person, m as movie, r.role as role + |""".stripMargin + +input.neo4jCypher[Role](opts, queryRoles) +``` + +## Writes + +Instances can be written via @scaladoc[saveAsNeo4j](com.spotify.scio.neo4j.syntax.Neo4jSCollectionOps#saveAsNeo4j(neo4jOptions:com.spotify.scio.neo4j.Neo4jOptions,unwindCypher:String,batchSize:Long)(implicitneo4jType:magnolify.neo4j.ValueType[T],implicitcoder:com.spotify.scio.coders.Coder[T]):com.spotify.scio.io.ClosedTap[Nothing]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.neo4j._ + +case class Entity(id: String, property: Option[String]) + +val sc: ScioContext = ??? +val input: SCollection[Entity] = ??? + +val opts = Neo4jOptions(Neo4jConnectionOptions("neo4j://neo4j.com:7687", "username", "password")) +val unwindCypher = + """UNWIND $rows AS row + |MERGE (e:Entity {id: row.id}) + |ON CREATE SET p.id = row.id, p.property = row.property + |""".stripMargin +input.saveAsNeo4j(opts, unwindCypher) +``` \ No newline at end of file diff --git a/site/src/main/paradox/io/Object.md b/site/src/main/paradox/io/Object.md new file mode 100644 index 0000000000..becbe14348 --- /dev/null +++ b/site/src/main/paradox/io/Object.md @@ -0,0 +1,36 @@ +# Object file + +"Object files" can be used to save an `SCollection` of records with an arbitrary type by using Beam's coder infrastructure. 
+Each record is encoded to a byte array by the available Beam coder, the bytes are then wrapped in a simple Avro record containing a single byte field, then saved to disk. + +Object files are convenient for ad-hoc work, but it should be preferred to use a real schema-backed format when possible. + +## Reading object files + +Object files can be read via @scaladoc[objectFile](com.spotify.scio.avro.syntax.ScioContextOps#objectFile[T](path:String)(implicitevidence$1:com.spotify.scio.coders.Coder[T]):com.spotify.scio.values.SCollection[T]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.avro._ +import com.spotify.scio.values.SCollection + +case class A(i: Int, s: String) + +val sc: ScioContext = ??? +val elements: SCollection[A] = sc.objectFile("gs:///*.obj.avro") +``` + +## Writing object files + +Object files can be written via @scaladoc[saveAsObjectFile](com.spotify.scio.avro.syntax.ObjectFileSCollectionOps#saveAsObjectFile(path:String,numShards:Int,suffix:String,codec:org.apache.avro.file.CodecFactory,metadata:Map[String,AnyRef],shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier)(implicitcoder:com.spotify.scio.coders.Coder[T]):com.spotify.scio.io.ClosedTap[T]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.avro._ +import com.spotify.scio.values.SCollection + +case class A(i: Int, s: String) + +val elements: SCollection[A] = ??? +elements.saveAsObjectFile("gs://") +``` diff --git a/site/src/main/paradox/io/Parquet.md b/site/src/main/paradox/io/Parquet.md index 009065e8f2..7481d6d74b 100644 --- a/site/src/main/paradox/io/Parquet.md +++ b/site/src/main/paradox/io/Parquet.md @@ -118,13 +118,13 @@ If your Avro schema contains a logical type, you'll need to supply an additional If you're using the default version of Avro (1.8), you can use Scio's pre-built logical type conversions: -```scala mdoc:reset +```scala mdoc:compile-only import com.spotify.scio._ import com.spotify.scio.values.SCollection import com.spotify.scio.parquet.avro._ import com.spotify.scio.avro.TestRecord -val sc: ScioContext = ScioContext() +val sc: ScioContext = ??? val data: SCollection[TestRecord] = sc.parallelize(List[TestRecord]()) // Reads @@ -242,7 +242,7 @@ Add the following import to handle typed Parquet in a way compatible with Parque import magnolify.parquet.ParquetArray.AvroCompat._ ``` -The same Avro schema evolution principles apply to Parquet, i.e. only append `OPTIONAL` or `REPEATED` fields with default `null` or `[]`. See this [test](https://github.com/spotify/magnolify/blob/master/parquet/src/test/scala/magnolify/parquet/test/SchemaEvolutionSuite.scala) for some common scenarios w.r.t. Parquet schema evolution. +The same Avro schema evolution principles apply to Parquet, i.e. only append `OPTIONAL` or `REPEATED` fields with default `null` or `[]`. See this [test](https://github.com/spotify/magnolify/blob/main/parquet/src/test/scala/magnolify/parquet/SchemaEvolutionSuite.scala) for some common scenarios w.r.t. Parquet schema evolution. ## Configuring Parquet @@ -295,4 +295,4 @@ A full list of Parquet configuration options can be found [here](https://github. Parquet read internals have been reworked in Scio 0.12.0. As of 0.12.0, you can opt-into the new Parquet read implementation, backed by the new Beam [SplittableDoFn](https://beam.apache.org/blog/splittable-do-fn/) API, by following the instructions -@ref:[here](../migrations/v0.12.0-Migration-Guide.md#parquet-reads). 
+@ref:[here](../releases/migrations/v0.12.0-Migration-Guide.md#parquet-reads). diff --git a/site/src/main/paradox/io/Pubsub.md b/site/src/main/paradox/io/Pubsub.md index 9ce784af3f..9f0b0f851d 100644 --- a/site/src/main/paradox/io/Pubsub.md +++ b/site/src/main/paradox/io/Pubsub.md @@ -6,7 +6,7 @@ Scio supports [Google Cloud PubSub](https://cloud.google.com/pubsub/docs/overvie Use the appropriate `PubsubIO` method with `ScioContext.read` to read into strings, avro, protobuf, beam's `PubsubMessage`, or into any type supported by a scio `Coder`. Pass a `PubsubIO.ReadParam` to configure whether reading from a topic or subscription. -```scala +```scala mdoc:compile-only import com.spotify.scio._ import com.spotify.scio.values._ import com.spotify.scio.pubsub._ @@ -25,7 +25,7 @@ val d: PubsubIO[PubsubMessage] = PubsubIO.pubsub[PubsubMessage]("messages") case class MyClass(s: String, i: Int) val e: PubsubIO[MyClass] = PubsubIO.coder[MyClass]("myclasses") -val sc = ScioContext() +val sc: ScioContext = ??? // read strings from a subscription val in1: SCollection[String] = sc.read(a)(PubsubIO.ReadParam(PubsubIO.Subscription)) @@ -36,17 +36,17 @@ val in2: SCollection[String] = sc.read(a)(PubsubIO.ReadParam(PubsubIO.Topic)) The `withAttributes` methods give access to the PubSub attributes within the SCollection: -```scala +```scala mdoc:compile-only import com.spotify.scio._ import com.spotify.scio.values._ import com.spotify.scio.pubsub._ -val sc = ScioContext() +val sc: ScioContext = ??? val in: SCollection[(String, Map[String, String])] = sc.read(PubsubIO.withAttributes[String]("strings"))(PubsubIO.ReadParam(PubsubIO.Subscription)) .map { case (element, attributes) => attributes.get("name") - // ... + ??? } ``` @@ -54,7 +54,7 @@ val in: SCollection[(String, Map[String, String])] = PubSub write methods use the same PubSubIO methods as reading: -```scala +```scala mdoc:compile-only import com.spotify.scio._ import com.spotify.scio.values._ import com.spotify.scio.pubsub._ @@ -81,7 +81,7 @@ myClasses.write(PubsubIO.coder[MyClass]("myClasses"))(PubsubIO.WriteParam()) Writing attributes: -```scala +```scala mdoc:compile-only import com.spotify.scio._ import com.spotify.scio.values._ import com.spotify.scio.pubsub._ diff --git a/site/src/main/paradox/io/ReadFiles.md b/site/src/main/paradox/io/ReadFiles.md new file mode 100644 index 0000000000..e82d85f682 --- /dev/null +++ b/site/src/main/paradox/io/ReadFiles.md @@ -0,0 +1,77 @@ +# ReadFiles + +Scio supports reading file paths from an `SCollection[String]` into various formats. + +## Read as text lines + +Reading to `String` text lines via @scaladoc[readFiles](com.spotify.scio.values.SCollection#readFiles(implicitev:T%3C:%3CString):com.spotify.scio.values.SCollection[String]): + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection + +val sc: ScioContext = ??? +val paths: SCollection[String] = ??? +val fileBytes: SCollection[String] = paths.readFiles +``` + +## Read entire file as String + +Reading to `String` text lines via @scaladoc[readFilesAsString](com.spotify.scio.values.SCollection#readFilesAsString(implicitev:T%3C:%3CString):com.spotify.scio.values.SCollection[String]): + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection + +val sc: ScioContext = ??? +val paths: SCollection[String] = ??? 
+val fileBytes: SCollection[String] = paths.readFiles +``` + +## Read as binary + +Reading to binary `Array[Byte]` via @scaladoc[readFilesAsBytes](com.spotify.scio.values.SCollection#readFilesAsBytes(implicitev:T%3C:%3CString):com.spotify.scio.values.SCollection[Array[Byte]]): + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection + +val sc: ScioContext = ??? +val paths: SCollection[String] = ??? +val fileBytes: SCollection[Array[Byte]] = paths.readFilesAsBytes +``` + +## Read as a custom type + +Reading to a custom type with a user-defined function from `FileIO.ReadableFile` to the output type via @scaladoc[readFiles](com.spotify.scio.values.SCollection#readFiles[A](f:org.apache.beam.sdk.io.FileIO.ReadableFile=%3EA)(implicitevidence$24:com.spotify.scio.coders.Coder[A],implicitev:T%3C:%3CString):com.spotify.scio.values.SCollection[A]): + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.{io => beam} + +case class A(i: Int, s: String) +val sc: ScioContext = ??? +val paths: SCollection[String] = ??? +val userFn: beam.FileIO.ReadableFile => A = ??? +val fileBytes: SCollection[A] = paths.readFiles(userFn) +``` + +## Read with a Beam transform + +If there is an existing beam `PTransform` from `FileIO.ReadableFile` to `A` (as an example, beam's `TextIO.readFiles()`), this can be reused via another variant of @scaladoc[readFiles](com.spotify.scio.values.SCollection#readFiles[A](filesTransform:org.apache.beam.sdk.transforms.PTransform[org.apache.beam.sdk.values.PCollection[org.apache.beam.sdk.io.FileIO.ReadableFile],org.apache.beam.sdk.values.PCollection[A]],directoryTreatment:org.apache.beam.sdk.io.FileIO.ReadMatches.DirectoryTreatment,compression:org.apache.beam.sdk.io.Compression)(implicitevidence$26:com.spotify.scio.coders.Coder[A],implicitev:T%3C:%3CString):com.spotify.scio.values.SCollection[A]) + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.{io => beam} +import org.apache.beam.sdk.transforms.PTransform +import org.apache.beam.sdk.values.PCollection + +case class A(i: Int, s: String) + +val sc: ScioContext = ??? +val paths: SCollection[String] = ??? +val userTransform: PTransform[PCollection[beam.FileIO.ReadableFile], PCollection[A]] = ??? +val fileBytes: SCollection[A] = paths.readFiles(userTransform) +``` \ No newline at end of file diff --git a/site/src/main/paradox/io/Redis.md b/site/src/main/paradox/io/Redis.md new file mode 100644 index 0000000000..75d8430453 --- /dev/null +++ b/site/src/main/paradox/io/Redis.md @@ -0,0 +1,42 @@ +# Redis + +Scio provides support for [Redis](https://redis.io/) in the `scio-redis` artifact. + +# Batch read + +Reading key-value pairs from redis for a specific key pattern is supported via @scaladoc[redis](com.spotify.scio.redis.syntax.ScioContextOps#redis(connectionOptions:com.spotify.scio.redis.RedisConnectionOptions,keyPattern:String,batchSize:Int,outputParallelization:Boolean):com.spotify.scio.values.SCollection[(String,String)]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.redis._ + +val sc: ScioContext = ??? 
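+// Placeholder host and port; the key pattern below uses Redis glob-style matching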
+val connectionOptions = RedisConnectionOptions("redisHost", 6379) +val keyPattern = "foo*" + +val elements: SCollection[(String, String)] = sc.redis(connectionOptions, keyPattern) +``` + +# Lookups + +Looking up specific keys from redis can be done with @scaladoc[RedisDoFn](com.spotify.scio.redis.RedisDoFn): + +@@snip [RedisExamples.scala](/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala) { #RedisLookup_example } + +# Write + +Writes to Redis require an `SCollection` of a subclass of @scaladoc[RedisMutation](com.spotify.scio.redis.types.RedisMutation). +Writes work in both batch and streaming modes via @scaladoc[saveAsRedis](com.spotify.scio.redis.syntax.SCollectionRedisOps#saveAsRedis(connectionOptions:com.spotify.scio.redis.RedisConnectionOptions,batchSize:Int):com.spotify.scio.io.ClosedTap[Nothing]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.values.SCollection +import com.spotify.scio.redis._ +import com.spotify.scio.redis.types._ + +val connectionOptions = RedisConnectionOptions("redisHost", 6379) + +val keys: SCollection[String] = ??? +keys.map(IncrBy(_, 1)).saveAsRedis(connectionOptions) +``` diff --git a/site/src/main/paradox/io/Spanner.md b/site/src/main/paradox/io/Spanner.md new file mode 100644 index 0000000000..5eebcf9836 --- /dev/null +++ b/site/src/main/paradox/io/Spanner.md @@ -0,0 +1,66 @@ +# Spanner + +Scio supports reading and writing from [Google Cloud Spanner](https://cloud.google.com/spanner). + +## Read from Spanner + +Reads from Spanner occur via a query with @scaladoc[spannerQuery](com.spotify.scio.spanner.syntax.SpannerScioContextOps#spannerQuery(spannerConfig:org.apache.beam.sdk.io.gcp.spanner.SpannerConfig,query:String,withBatching:Boolean,withTransaction:Boolean):com.spotify.scio.values.SCollection[com.google.cloud.spanner.Struct]) or for an entire table with @scaladoc[spannerTable](com.spotify.scio.spanner.syntax.SpannerScioContextOps#spannerTable(spannerConfig:org.apache.beam.sdk.io.gcp.spanner.SpannerConfig,table:String,columns:Seq[String],withBatching:Boolean,withTransaction:Boolean):com.spotify.scio.values.SCollection[com.google.cloud.spanner.Struct]). Both return an `SCollection` of [`Struct`](https://www.javadoc.io/doc/com.google.cloud/google-cloud-spanner/6.38.0/com/google/cloud/spanner/Struct.html): + +To read with a query: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.spanner._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig +import com.google.cloud.spanner.Struct + +val config: SpannerConfig = SpannerConfig + .create() + .withProjectId("someProject") + .withDatabaseId("someDatabase") + .withInstanceId("someInstance") + +val sc: ScioContext = ??? +val queryStructs: SCollection[Struct] = sc.spannerQuery(config, "SELECT a, b FROM table WHERE c > 5") +``` + +To read an entire table: + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.spanner._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig +import com.google.cloud.spanner.Struct + +val config: SpannerConfig = SpannerConfig + .create() + .withProjectId("someProject") + .withDatabaseId("someDatabase") + .withInstanceId("someInstance") + +val sc: ScioContext = ??? 
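+// Reads only the listed columns from the named table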
+val tableStructs: SCollection[Struct] = sc.spannerTable(config, "table", columns=List("a", "b")) +``` + +## Write to Spanner + +An `SCollection` containing [`Mutation`](https://javadoc.io/static/com.google.cloud/google-cloud-spanner/6.36.0/com/google/cloud/spanner/Mutation.html#com.google.cloud.spanner.Mutation) instances can be written to Spanner via @scaladoc[saveAsSpanner](com.spotify.scio.spanner.syntax.SpannerSCollectionOps#saveAsSpanner(spannerConfig:org.apache.beam.sdk.io.gcp.spanner.SpannerConfig,failureMode:org.apache.beam.sdk.io.gcp.spanner.SpannerIO.FailureMode,batchSizeBytes:Long):com.spotify.scio.io.ClosedTap[Nothing]): + +```scala mdoc:compile-only +import com.spotify.scio.spanner._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig +import com.google.cloud.spanner.Mutation + +val config: SpannerConfig = SpannerConfig + .create() + .withProjectId("someProject") + .withDatabaseId("someDatabase") + .withInstanceId("someInstance") + +val mutations: SCollection[Mutation] = ??? +mutations.saveAsSpanner(config) +``` + diff --git a/site/src/main/paradox/io/Tensorflow.md b/site/src/main/paradox/io/Tensorflow.md new file mode 100644 index 0000000000..62c528cf9c --- /dev/null +++ b/site/src/main/paradox/io/Tensorflow.md @@ -0,0 +1,93 @@ +# Tensorflow + +Scio supports several methods of reading and writing [Tensorflow](https://www.tensorflow.org/) records. + +## Reading + +Depending on your input format, and if you need to provide a schema or not, there are various ways to read Tensorflow files. + +@scaladoc[tfRecordFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[Array[Byte]]) reads entire `TFRecord` files into byte array elements in the pipeline, @scaladoc[tfRecordExampleFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordExampleFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[org.tensorflow.proto.example.Example]) (or @scaladoc[tfRecordExampleFileWithSchema](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordExampleFileWithSchema(path:String,schemaFilename:String,compression:org.apache.beam.sdk.io.Compression):(com.spotify.scio.values.SCollection[org.tensorflow.proto.example.Example],com.spotify.scio.values.DistCache[org.tensorflow.metadata.v0.Schema]))) will read @javadoc[Example](org.tensorflow.proto.example.Example) instances, and @scaladoc[tfRecordSequenceExampleFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordSequenceExampleFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[org.tensorflow.proto.example.SequenceExample]) (or @scaladoc[tfRecordSequenceExampleFileWithSchema](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordSequenceExampleFileWithSchema(path:String,schemaFilename:String,compression:org.apache.beam.sdk.io.Compression):(com.spotify.scio.values.SCollection[org.tensorflow.proto.example.SequenceExample],com.spotify.scio.values.DistCache[org.tensorflow.metadata.v0.Schema]))) will read @javadoc[SequenceExample](org.tensorflow.proto.example.SequenceExample) instances: + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.values.SCollection +import com.spotify.scio.tensorflow._ +import org.tensorflow.proto.example.{Example, SequenceExample} + +val sc: ScioContext = ??? 
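+// One read per format described above: raw record bytes, Example, and SequenceExample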
+val recordBytes: SCollection[Array[Byte]] = sc.tfRecordFile("gs://input-record-path") +val examples: SCollection[Example] = sc.tfRecordExampleFile("gs://input-example-path") +val sequenceExamples: SCollection[SequenceExample] = sc.tfRecordSequenceExampleFile("gs://input-sequence-example-path") +``` + +## Writing + +Similar to reading, there are multiple ways to write Tensorflow files, depending on the format of the elements to be output. +Each of these write methods is called `saveAsTfRecordFile`, but only one variant of the method is available based on the element type. + +* For `SCollection[T]` where `T` is a subclass of `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.ExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.example.Example]) +* For `SCollection[Seq[T]]` where `T` is a subclass of `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.SeqExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.example.Example]) +* For `SCollection[T]` where `T` is a subclass of `SequenceExample`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.SequenceExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.example.SequenceExample]) +* For `SCollection[Array[Byte]]`, where it is recommended that the bytes are a serialized `Example`: +@scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.TFRecordSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier)(implicitev:T%3C:%3CArray[Byte]):com.spotify.scio.io.ClosedTap[Array[Byte]]) + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.tensorflow._ +import org.tensorflow.proto.example.{Example, SequenceExample} + +val recordBytes: SCollection[Array[Byte]] = ??? +val examples: SCollection[Example] = ??? +val seqExamples: SCollection[Seq[Example]] = ??? +val sequenceExamples: SCollection[SequenceExample] = ??? + +recordBytes.saveAsTfRecordFile("gs://output-record-path") +examples.saveAsTfRecordFile("gs://output-example-path") +seqExamples.saveAsTfRecordFile("gs://output-seq-example-path") +sequenceExamples.saveAsTfRecordFile("gs://output-sequence-example-path") +``` + +# Prediction/inference + +Scio supports preforming inference on a saved Tensorflow model. 
+ +For an `SCollection` of an arbitrary user type, predictions can be made against the raw model via @scaladoc[predict](com.spotify.scio.tensorflow.syntax.PredictSCollectionOps#predict[V,W](savedModelUri:String,fetchOps:Seq[String],options:com.spotify.zoltar.tf.TensorFlowModel.Options,signatureName:String)(inFn:T=%3EMap[String,org.tensorflow.Tensor])(outFn:(T,Map[String,org.tensorflow.Tensor])=%3EV)(implicitevidence$1:com.spotify.scio.coders.Coder[V]):com.spotify.scio.values.SCollection[V]) or using the model's [SignatureDefs](https://www.tensorflow.org/tfx/serving/signature_defs) with @scaladoc[predictWithSigDef](com.spotify.scio.tensorflow.syntax.PredictSCollectionOps#predictWithSigDef[V,W](savedModelUri:String,options:com.spotify.zoltar.tf.TensorFlowModel.Options,fetchOps:Option[Seq[String]],signatureName:String)(inFn:T=%3EMap[String,org.tensorflow.Tensor])(outFn:(T,Map[String,org.tensorflow.Tensor])=%3EV)(implicitevidence$2:com.spotify.scio.coders.Coder[V]):com.spotify.scio.values.SCollection[V]): + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.tensorflow._ +import com.spotify.zoltar.tf.TensorFlowModel +import org.tensorflow._ +import org.tensorflow.proto.example.Example + +case class A() +case class B() + +def toTensors(a: A): Map[String, Tensor] = ??? +def fromTensors(a: A, tensors: Map[String, Tensor]): B = ??? + +val elements: SCollection[A] = ??? +val options: TensorFlowModel.Options = ??? +val fetchOpts: Seq[String] = ??? + +val result: SCollection[B] = elements.predict[B]("gs://model-path", fetchOpts, options)(toTensors)(fromTensors) +val b: SCollection[B] = elements.predictWithSigDef[B]("gs://model-path", options)(toTensors)(fromTensors _) +``` + +For an `SCollection` of some subclass of `Example`, a prediction can be made via @scaladoc[predictTfExamples](com.spotify.scio.tensorflow.syntax.PredictSCollectionOps#predictTfExamples[V](savedModelUri:String,options:com.spotify.zoltar.tf.TensorFlowModel.Options,exampleInputOp:String,fetchOps:Option[Seq[String]],signatureName:String)(outFn:(T,Map[String,org.tensorflow.Tensor])=%3EV)(implicitevidence$3:com.spotify.scio.coders.Coder[V],implicitev:T%3C:%3Corg.tensorflow.proto.example.Example):com.spotify.scio.values.SCollection[V]): + +```scala mdoc:compile-only +import com.spotify.scio.values.SCollection +import com.spotify.scio.tensorflow._ +import com.spotify.zoltar.tf.TensorFlowModel +import org.tensorflow._ +import org.tensorflow.proto.example.Example + +val exampleElements: SCollection[Example] = ??? +val options: TensorFlowModel.Options = ??? +def toExample(in: Example, tensors: Map[String, Tensor]): Example = ??? + +val c: SCollection[Example] = exampleElements.predictTfExamples[Example]("gs://model-path", options) { + case (a, tensors) => toExample(a, tensors) +} +``` diff --git a/site/src/main/paradox/io/Text.md b/site/src/main/paradox/io/Text.md new file mode 100644 index 0000000000..0a0027c322 --- /dev/null +++ b/site/src/main/paradox/io/Text.md @@ -0,0 +1,32 @@ +# Text + +## Reading text + +Scio reads newline-delimited text via @scaladoc[textFile](com.spotify.scio.ScioContext#textFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[String]): + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.avro._ +import com.spotify.scio.values.SCollection + +val sc: ScioContext = ??? 
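+// Illustrative only (hypothetical path, not part of the original example): an explicit
+// Compression can be passed when the file extension does not reflect the actual compression.
+import org.apache.beam.sdk.io.Compression
+val compressed: SCollection[String] = sc.textFile("gs://<bucket>/part-*", compression = Compression.GZIP)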
+val elements: SCollection[String] = sc.textFile("gs:///*.txt") +``` + +## Writing text + +An `SCollection[String]` or `SCollection` of any class implementing `toString` can be written out to a newline-delimited text file via @scaladoc[saveAsTextFile](com.spotify.scio.values.SCollection#saveAsTextFile(path:String,numShards:Int,suffix:String,compression:org.apache.beam.sdk.io.Compression,header:Option[String],footer:Option[String],shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier)(implicitct:scala.reflect.ClassTag[T]):com.spotify.scio.io.ClosedTap[String]). +An optional `header` and `footer` parameter can be provided. + +```scala mdoc:compile-only +import com.spotify.scio._ +import com.spotify.scio.avro._ +import com.spotify.scio.values.SCollection + +val elements: SCollection[String] = ??? +elements.saveAsTextFile( + "gs://", + header=Some("header"), + footer=Some("footer") +) +``` diff --git a/site/src/main/paradox/io/index.md b/site/src/main/paradox/io/index.md index 52bb6e99b7..72a47d427a 100644 --- a/site/src/main/paradox/io/index.md +++ b/site/src/main/paradox/io/index.md @@ -5,10 +5,25 @@ @@@ index * @ref:[Avro](Avro.md) +* @ref:[Binary](Binary.md) * @ref:[BigQuery](BigQuery.md) * @ref:[Bigtable](Bigtable.md) +* @ref:[Cassandra](Cassandra.md) +* @ref:[CSV](Csv.md) +* @ref:[Datastore](Datastore.md) +* @ref:[Grpc](Grpc.md) +* @ref:[Elasticsearch](Elasticsearch.md) +* @ref:[JDBC](Jdbc.md) +* @ref:[Json](Json.md) +* @ref:[Neo4J](Neo4J.md) +* @ref:[Object](Object.md) * @ref:[Parquet](Parquet.md) * @ref:[Protobuf](Protobuf.md) * @ref:[Pubsub](Pubsub.md) +* @ref:[ReadFiles](ReadFiles.md) +* @ref:[Redis](Redis.md) +* @ref:[Spanner](Spanner.md) +* @ref:[Tensorflow](Tensorflow.md) +* @ref:[Text](Text.md) @@@ diff --git a/site/src/main/paradox/Apache-Beam.md b/site/src/main/paradox/releases/Apache-Beam.md similarity index 74% rename from site/src/main/paradox/Apache-Beam.md rename to site/src/main/paradox/releases/Apache-Beam.md index 31f5cafb34..c6b2fe1f6e 100644 --- a/site/src/main/paradox/Apache-Beam.md +++ b/site/src/main/paradox/releases/Apache-Beam.md @@ -16,6 +16,7 @@ Also check out the [SDK Version Support Status](https://cloud.google.com/dataflo | **Scio** | **SDK Dependency** | **Description** | |:--------:|:------------------:|:------------------------------------------------------------------------------------------------------------------------------------| +| 0.13.x | Apache Beam 2.x.x | scio-elasticsearch6 removed. scio-elasticsearch7 migrated to new client. File based ScioIO param changes. | | 0.12.x | Apache Beam 2.x.x | com.spotify.scio.extra.bigquery, com.spotify.scio.pubsub removed. scio-elasticsearch6 deprecated. | | 0.11.x | Apache Beam 2.x.x | scio-sql and case-app removed, com.spotify.scio.extra.bigquery deprecated, shaded Beam Avro coder, `tensorflow-core-platform` 0.3.3 | | 0.10.x | Apache Beam 2.x.x | Coder implicits, `scio-google-cloud-platform` | @@ -31,6 +32,13 @@ Also check out the [SDK Version Support Status](https://cloud.google.com/dataflo | **Scio Version** | **Beam Version** | **Details** | |:----------------:|:----------------:|:------------------------------------------------------| +| 0.13.3 | 2.50.0 | This version will be deprecated on August 30, 2024. | +| 0.13.2 | 2.49.0 | This version will be deprecated on July 17, 2024. | +| 0.13.1 | 2.49.0 | This version will be deprecated on July 17, 2024. | +| 0.13.0 | 2.48.0 | This version will be deprecated on May 31, 2024. 
| +| 0.12.8 | 2.46.0 | This version will be deprecated on March 10, 2024. | +| 0.12.7 | 2.46.0 | This version will be deprecated on March 10, 2024. | +| 0.12.6 | 2.46.0 | This version will be deprecated on March 10, 2024. | | 0.12.5 | 2.45.0 | This version will be deprecated on February 15, 2024. | | 0.12.4 | 2.44.0 | This version will be deprecated on January 13, 2024. | | 0.12.3 | 2.44.0 | This version will be deprecated on January 13, 2024. | @@ -42,28 +50,26 @@ Also check out the [SDK Version Support Status](https://cloud.google.com/dataflo | 0.11.12 | 2.41.0 | This version will be deprecated on August 23rd, 2023. | | 0.11.11 | 2.41.0 | This version will be deprecated on August 23rd, 2023. | | 0.11.10 | 2.41.0 | This version will be deprecated on August 23rd, 2023. | -| 0.11.9 | 2.39.0 | This version will be deprecated on May 25, 2023. | -| 0.11.6 | 2.38.0 | This version will be deprecated on April 20, 2023. | +| 0.11.9 | 2.39.0 | Deprecated on May 25, 2023. | +| 0.11.6 | 2.38.0 | Deprecated on April 20, 2023. | | 0.11.5 | 2.36.0 | Deprecated on February 7, 2023. | | 0.11.4 | 2.35.0 | Deprecated on December 29, 2022. | | 0.11.3 | 2.35.0 | Deprecated on December 29, 2022. | | 0.11.2 | 2.34.0 | Deprecated on November 11, 2022. | | 0.11.1 | 2.33.0 | Deprecated on October 7, 2022. | | 0.11.0 | 2.32.0 | Deprecated on August 25, 2022. | -| 0.10.4+ | 2.30.0 | Deprecated on June 10, 2022. | -| 0.10.3 | 2.29.0 | Deprecated on April 29, 2022. | -| 0.10.0+ | 2.28.0 | Deprecated on February 22, 2022. | ## Beam dependencies -Scio's other library dependencies are kept in sync with Beam's to avoid compatibility issues. You can find -Beam's dependency list in its [Groovy config](https://github.com/apache/beam/blob/v2.35.0/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy) (substitute the version tag in the URL with the desired Beam version). Additionally, Beam keeps many of its Google dependencies in sync with a [central BOM](https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/24.0.0/artifact_details.html) (subsitute the version tag in the URL with the value of `google_cloud_platform_libraries_bom` from Beam). Scio users who suspect incompatibility issues in their pipelines (common issues are GRPC, Netty, or Guava) can run `sbt evicted` and `sbt dependencyTree` to ensure their direct and transitive dependencies don't conflict with Scio or Beam. +Scio's other library dependencies are kept in sync with Beam's to avoid compatibility issues. Scio will typically _not_ bump dependency versions beyond what is supported in Beam due to the large test surface and the potential for data loss. + +You can find Beam's dependency list in its [Groovy config](https://github.com/apache/beam/blob/v2.35.0/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy) (substitute the version tag in the URL with the desired Beam version). Additionally, Beam keeps many of its Google dependencies in sync with a [central BOM](https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/24.0.0/artifact_details.html) (substitute the version tag in the URL with the value of `google_cloud_platform_libraries_bom` from Beam). Scio users who suspect incompatibility issues in their pipelines (common issues are GRPC, Netty, or Guava) can run `sbt evicted` and `sbt dependencyTree` to ensure their direct and transitive dependencies don't conflict with Scio or Beam. 
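+
+For example, a minimal `build.sbt` sketch of forcing a single version of a commonly-conflicting dependency (the Guava coordinates below are purely illustrative; align them with the version listed in Beam's build or the Google Cloud libraries BOM for your Beam release):
+
+```
+// Pin Guava to one version across the whole dependency tree so that the
+// version Beam expects wins over other transitive requirements.
+dependencyOverrides += "com.google.guava" % "guava" % "32.1.2-jre"
+```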
## Release cycle and backport procedures Scio has a frequent release cycle, roughly every 2-4 weeks, as compared to months for the upstream Apache Beam. We also aim to stay a step ahead by pulling changes from upstream and contributing new ones back. -Let's call the Beam version that Scio depends on `current`, and upstream master `latest`. Here're the procedures for backporting changes. +Let's call the Beam version that Scio depends on `current`, and upstream master `latest`. Here are the procedures for backporting changes: For changes available in `latest` but not in `current`: - Copy Java files from `latest` to Scio repo diff --git a/site/src/main/paradox/releases/breaking-changes.md b/site/src/main/paradox/releases/breaking-changes.md index 7bee029a55..5498f3e4f6 100644 --- a/site/src/main/paradox/releases/breaking-changes.md +++ b/site/src/main/paradox/releases/breaking-changes.md @@ -1,6 +1,13 @@ # Breaking Changelog -## Breaking changes since 0.12.0 (@ref:[v0.12.0 Migration Guide](../migrations/v0.12.0-Migration-Guide.md)) +## Breaking changes since 0.13.0 +- Removed `scio-elasticsearch6` +- Migrated `scio-elasticsearch7` to new [java client](https://www.elastic.co/guide/en/elasticsearch/client/java-api-client/7.17/introduction.html) +- Changed `skewedJoin` API (scalafix @github[rule](scalafix/rules/src/main/scala/fix/v0_13_0/FixSkewedJoins.scala) provided) +- New File based ScioIO parameters (notably `suffix` in the read params) +- Removal of unused type parameter on tensorflow `predict` and `predictWithSigDef` + +## Breaking changes since 0.12.0 (@ref:[v0.12.0 Migration Guide](migrations/v0.12.0-Migration-Guide.md)) - Removed `com.spotify.scio.extra.bigquery` - Removed `com.spotify.scio.pubsub` specializations - Changed type signatures of SMB methods to accommodate secondary-keyed SMB @@ -14,24 +21,24 @@ ) ``` -## Breaking changes since Scio 0.10.0 (@ref:[v0.10.0 Migration Guide](../migrations/v0.10.0-Migration-Guide.md)) +## Breaking changes since Scio 0.10.0 (@ref:[v0.10.0 Migration Guide](migrations/v0.10.0-Migration-Guide.md)) - Move GCP modules to `scio-google-cloud-platform` - Simplify coder implicits -## Breaking changes since Scio 0.9.0 (@ref:[v0.9.0 Migration Guide](../migrations/v0.9.0-Migration-Guide.md)) +## Breaking changes since Scio 0.9.0 (@ref:[v0.9.0 Migration Guide](migrations/v0.9.0-Migration-Guide.md)) - Drop Scala 2.11, add Scala 2.13 support - Remove deprecated modules `scio-cassandra2` and `scio-elasticsearch2` - Remove deprecated methods since 0.8.0 - Switch from Algebird `Hash128[K]` to Guava `Funnel[K]` for Bloom filter and sparse transforms -## Breaking changes since Scio 0.8.0 (@ref:[v0.8.0 Migration Guide](../migrations/v0.8.0-Migration-Guide.md)) +## Breaking changes since Scio 0.8.0 (@ref:[v0.8.0 Migration Guide](migrations/v0.8.0-Migration-Guide.md)) - `ScioIO`s no longer return `Future` - `ScioContext#close` returns `ScioExecutionContext` instead of `ScioResult` - Async `DoFn` refactor - Deprecate `scio-cassandra2` and `scio-elasticsearch2` - `ContextAndArgs#typed` no longer accepts list-case #2221 -## Breaking changes since Scio 0.7.0 (@ref:[v0.7.0 Migration Guide](../migrations/v0.7.0-Migration-Guide.md)) +## Breaking changes since Scio 0.7.0 (@ref:[v0.7.0 Migration Guide](migrations/v0.7.0-Migration-Guide.md)) - New [Magnolia](https://github.com/softwaremill/magnolia) based @ref:[Coders](../internals/Coders.md) derivation - New @ref:[ScioIO](../internals/ScioIO.md) replaces `TestIO[T]` to simplify IO implementation and stubbing in `JobTest` @@ 
-48,7 +55,7 @@ ## Breaking changes since Scio 0.4.0 -- Accumulators are replaced by the new metrics API, see @extref[MetricsExample.scala](example:MetricsExample) for more +- Accumulators are replaced by the new metrics API, see @extref[MetricsExample](example:MetricsExample) for more - `com.spotify.scio.hdfs` package and related APIs (`ScioContext#hdfs*`, `SCollection#saveAsHdfs*`) are removed, regular file IO API should now support both GCS and HDFS (if `scio-hdfs` is included as a dependency). - Starting Scio 0.4.4, Beam runner is completely decoupled from `scio-core`. See [[Runners]] page for more details. diff --git a/site/src/main/paradox/releases/index.md b/site/src/main/paradox/releases/index.md index 58fbc1d319..2446b885e3 100644 --- a/site/src/main/paradox/releases/index.md +++ b/site/src/main/paradox/releases/index.md @@ -3,6 +3,8 @@ @@toc { depth=1 } @@@ index +* @ref:[Beam Compatibility Guide](Apache-Beam.md) +* @ref:[Migrations](migrations/index.md) * @ref:[Breaking Changelog](breaking-changes.md) * Release blogs * @ref:[v0.12.0 Release Blog](v0.12.0.md) diff --git a/site/src/main/paradox/migrations/index.md b/site/src/main/paradox/releases/migrations/index.md similarity index 85% rename from site/src/main/paradox/migrations/index.md rename to site/src/main/paradox/releases/migrations/index.md index 56cca5c57d..7713839fd0 100644 --- a/site/src/main/paradox/migrations/index.md +++ b/site/src/main/paradox/releases/migrations/index.md @@ -9,5 +9,6 @@ * @ref:[v0.9.0](v0.9.0-Migration-Guide.md) * @ref:[v0.10.0](v0.10.0-Migration-Guide.md) * @ref:[v0.12.0](v0.12.0-Migration-Guide.md) +* @ref:[v0.13.0](v0.13.0-Migration-Guide.md) @@@ diff --git a/site/src/main/paradox/migrations/v0.10.0-Migration-Guide.md b/site/src/main/paradox/releases/migrations/v0.10.0-Migration-Guide.md similarity index 100% rename from site/src/main/paradox/migrations/v0.10.0-Migration-Guide.md rename to site/src/main/paradox/releases/migrations/v0.10.0-Migration-Guide.md diff --git a/site/src/main/paradox/migrations/v0.12.0-Migration-Guide.md b/site/src/main/paradox/releases/migrations/v0.12.0-Migration-Guide.md similarity index 93% rename from site/src/main/paradox/migrations/v0.12.0-Migration-Guide.md rename to site/src/main/paradox/releases/migrations/v0.12.0-Migration-Guide.md index 1a331a37e6..9a53b44bc5 100644 --- a/site/src/main/paradox/migrations/v0.12.0-Migration-Guide.md +++ b/site/src/main/paradox/releases/migrations/v0.12.0-Migration-Guide.md @@ -12,7 +12,7 @@ For usages of `saveAvroAsBigQuery`, use `saveAsBigQueryTable` from `com.spotify. 
Note: you can run the following sbt command to run the relevant [scalafix](https://scalacenter.github.io/scalafix/docs/developers/tutorial.html#run-the-rule-from-source-code) rules to update your BQ API usages: ``` -sbt "scalafixEnable; scalafix github:spotify/scio/FixBqSaveAsTable" +sbt "scalafixEnable; scalafix https://raw.githubusercontent.com/spotify/scio/main/scalafix/rules/src/main/scala/fix/v0_12_0/FixBqSaveAsTable.scala" ``` ## Removal of `com.spotify.scio.pubsub` specializations @@ -50,7 +50,7 @@ scoll.write(PubsubIO.withAttributes[String](topic, idAttribute, timestampAttribu Note: you can run the following sbt command to run the relevant [scalafix](https://scalacenter.github.io/scalafix/docs/developers/tutorial.html#run-the-rule-from-source-code) rules to automatically update deprecated Pub/Sub API usages: ``` -sbt "scalafixEnable; scalafix github:spotify/scio/FixPubsubSpecializations" +sbt "scalafixEnable; scalafix https://raw.githubusercontent.com/spotify/scio/main/scalafix/rules/src/main/scala/fix/v0_12_0/FixPubsubSpecializations.scala" ``` ## Changed type signatures of SMB methods @@ -126,13 +126,13 @@ Our plan is to support Legacy Parquet for all Scio 0.12.x versions, but fully de ## Async lookup DoFn All Async lookup DoFn have been reworked and now extends `DoFnWithResource`. -After upgrade you'll get the wollowing error: +After upgrade, you'll get the following error: ``` class MyLookupDoFn needs to be abstract, since method getResourceType in class DoFnWithResource of type ()com.spotify.scio.transforms.DoFnWithResource.ResourceType is not defined ``` -You must now implement the methor and return the appropriate resource type for your client: +You must now implement the method and return the appropriate resource type for your client: - `ResourceType.PER_INSTANCE` if your client is thread safe (this was the previous behavior) - `ResourceType.PER_CLONE` if your client is not thread safe -- `ResourceType.PER_CLASS` if your client is mean to be shared among all instanciations +- `ResourceType.PER_CLASS` if your client is meant to be shared among all instances diff --git a/site/src/main/paradox/releases/migrations/v0.13.0-Migration-Guide.md b/site/src/main/paradox/releases/migrations/v0.13.0-Migration-Guide.md new file mode 100644 index 0000000000..83a1b9926c --- /dev/null +++ b/site/src/main/paradox/releases/migrations/v0.13.0-Migration-Guide.md @@ -0,0 +1,58 @@ +# Scio v0.13.0 + +## gcs-connector now explicitly required + +Previously Scio shipped with `com.google.cloud.bigdataoss:gcs-connector` as part of `scio-parquet`. +This dependency is now removed, so `gcs-connector` must be explicitly enabled if using parquet on GCS: + +``` +val bigdataossVersion = "2.2.6" + +libraryDependencies ++= Seq( + "com.google.cloud.bigdataoss" % "gcs-connector" % s"hadoop2-$bigdataossVersion" +) +``` + +## Removed `scio-elasticsearch6` + +Please migrate to `scio-elasticsearch8`. + +## `scio-elasticsearch7` migrated to java client + +`saveAsElasticsearch` now requires a transform function returning `co.elastic.clients.elasticsearch.core.bulk.BulkOperation` instead of `org.elasticsearch.action.DocWriteRequest`. + +## New File based ScioIO parameters + +File-based IOs now consistently have a `suffix` parameter. +In cases where `ReadParam` was `Unit`, then a new param will be required. 
+This is the case, for example, with `AvroIO` and `GenericRecordIO`:
+
+```diff
+- sc.read(GenericRecordIO(path, schema))
++ sc.read(GenericRecordIO(path, schema))(AvroIO.ReadParam(suffix))
+- sc.read(SpecificRecordIO[T](path))
++ sc.read(SpecificRecordIO[T](path))(AvroIO.ReadParam(suffix))
+```
+
+## Kryo Coders nondeterministic
+
+Kryo coders in Scio have long been marked as deterministic, but users were cautioned not to use them in cases where determinism is important (e.g. with `distinct` or to encode keys in keyed operations) unless the Kryo coder was explicitly known to be deterministic.
+Users who did not understand or follow these instructions could silently produce corrupt data or incomplete results.
+
+Kryo coders are now marked as nondeterministic in all cases and an exception will be thrown if used in keyed operations.
+
+## Changed `skewedJoin` API
+
+Some variants of the `skewedJoin` API that took a `Long` threshold parameter have been removed.
+Use the variants with a `HotKeyMethod` parameter instead, providing `HotKeyMethod.Threshold(myThreshold)` as its value.
+
+## Tensorflow unused predict type parameter
+
+The Tensorflow `predict` and `predictWithSigDef` methods had an unused type parameter that is now removed.
+
+```diff
+- elements.predict[B, D]("gs://model-path", fetchOpts, options)(toTensors)(fromTensors)
++ elements.predict[B]("gs://model-path", fetchOpts, options)(toTensors)(fromTensors)
+- elements.predictWithSigDef[B, D]("gs://model-path", options)(toTensors)(fromTensors _)
++ elements.predictWithSigDef[B]("gs://model-path", options)(toTensors)(fromTensors _)
+```
diff --git a/site/src/main/paradox/migrations/v0.7.0-Migration-Guide.md b/site/src/main/paradox/releases/migrations/v0.7.0-Migration-Guide.md
similarity index 93%
rename from site/src/main/paradox/migrations/v0.7.0-Migration-Guide.md
rename to site/src/main/paradox/releases/migrations/v0.7.0-Migration-Guide.md
index 08c1f499e4..7c7c95442c 100644
--- a/site/src/main/paradox/migrations/v0.7.0-Migration-Guide.md
+++ b/site/src/main/paradox/releases/migrations/v0.7.0-Migration-Guide.md
@@ -16,13 +16,13 @@ Scio now provides a new class `ScioIO` that you can extend to support new types
 All existing IOs (GCS, BigQuery, BigTable, etc.) have been rewritten to use the new IO API.
 
-**Read more: @ref:[ScioIO](../internals/ScioIO.md)**
+**Read more: @ref:[ScioIO](../../internals/ScioIO.md)**
 
 ### New "static" coders
 
 Scio `0.7.0` also ship with a new `Coder` implementation that statically resolve the correct `Coder` for a given type **at compile time**.
 In previous versions, Scio would infer the correct coder implementation at runtime, which could lead to poor performances and occasionally, exceptions at runtime.
 
-**Read more: @ref:[Coders](../internals/Coders.md).**
+**Read more: @ref:[Coders](../../internals/Coders.md).**
 
 ### Performances improvements & benchmarks
@@ -58,7 +58,7 @@ For this to run properly, you code needs to compile.
 Run the following command in the sbt shell:
 
 ```
-> test:scalafix github:spotify/scio/FixAvroIO
+> test:scalafix https://raw.githubusercontent.com/spotify/scio/main/scalafix/rules/src/main/scala/fix/v0_7_0/FixAvroIO.scala
 [info] Running scalafix on 78 Scala sources
 [success] Total time: 7 s, completed Oct 17, 2018 12:49:31 PM
 ```
@@ -82,15 +82,15 @@ You can see all the rules @github[here](/scalafix/rules/src/main/scala/fix).
In your sbt shell, you can now apply the 3 other rules: ``` -> scalafix github:spotify/scio/AddMissingImports +> scalafix https://raw.githubusercontent.com/spotify/scio/main/scalafix/rules/src/main/scala/fix/v0_7_0/AddMissingImports.scala [info] Running scalafix on 173 Scala sources [success] Total time: 16 s, completed Oct 17, 2018 12:01:31 PM -> scalafix github:spotify/scio/RewriteSysProp +> scalafix https://raw.githubusercontent.com/spotify/scio/main/scalafix/rules/src/main/scala/fix/v0_7_0/RewriteSysProp.scala [info] Running scalafix on 173 Scala sources [success] Total time: 6 s, completed Oct 17, 2018 12:34:00 PM -> scalafix github:spotify/scio/BQClientRefactoring +> scalafix https://raw.githubusercontent.com/spotify/scio/main/scalafix/rules/src/main/scala/fix/v0_7_0/BQClientRefactoring.scala [info] Running scalafix on 173 Scala sources [success] Total time: 3 s, completed Oct 17, 2018 12:34:20 PM ``` @@ -261,7 +261,7 @@ In the process of upgrading Scio, you may encounter the following error: > Cannot find a Coder instance for type T -If you've defined a generic function that uses a `SCollection`, this function is likely to need a `Coder[T]`. Scio will require you to provide an implicit `Coder[T]`. You can read about Scala implicit parameters [here](https://docs.scala-lang.org/tour/implicit-parameters.html) +If you've defined a generic function that uses an `SCollection`, this function is likely to need a `Coder[T]`. Scio will require you to provide an implicit `Coder[T]`. You can read about Scala implicit parameters [here](https://docs.scala-lang.org/tour/implicit-parameters.html) Let's see a simple example. Say I created the following method `doSomething`: @@ -345,7 +345,7 @@ You can fix this warning in two ways: In both cases you want to define a Coder in your own code. The only difference is how you'll implement it. -Let's say you are using a `SCollection[java.util.Locale]`: +Let's say you are using an `SCollection[java.util.Locale]`: ```scala mdoc:reset:silent import com.spotify.scio.values.SCollection diff --git a/site/src/main/paradox/migrations/v0.8.0-Migration-Guide.md b/site/src/main/paradox/releases/migrations/v0.8.0-Migration-Guide.md similarity index 92% rename from site/src/main/paradox/migrations/v0.8.0-Migration-Guide.md rename to site/src/main/paradox/releases/migrations/v0.8.0-Migration-Guide.md index 7001828bfe..9ebd29f00a 100644 --- a/site/src/main/paradox/migrations/v0.8.0-Migration-Guide.md +++ b/site/src/main/paradox/releases/migrations/v0.8.0-Migration-Guide.md @@ -17,7 +17,7 @@ Beam SQL integration is added in this release! This integration comes in many flavors, from fluent api to string interpolation with both offering the possibility to typecheck the provided query at compile time. -A simple use case of this api is reflected in the example below. This example uses the fluent api to query the `SCollection[User]` and extract `username` and `age`. @scaladoc[`query`](com.spotify.scio.sql.SqlCollection1) return's @javadoc[Row](org.apache.beam.sdk.values.Row) which is `Beam`'s underlying type that contains the values and the @javadoc[Schema](org.apache.beam.sdk.values.Schema) of the extracted data. +A simple use case of this api is reflected in the example below. This example uses the fluent api to query the `SCollection[User]` and extract `username` and `age`. 
@scaladoc[query](com.spotify.scio.sql.SqlCollection1) returns @javadoc[Row](org.apache.beam.sdk.values.Row) which is `Beam`'s underlying type that contains the values and the @javadoc[Schema](org.apache.beam.sdk.values.Schema) of the extracted data.
 
 ```scala
 import com.spotify.scio.sql._
@@ -202,7 +202,7 @@ BigQuery Storage API provides fast access to BigQuery managed storage by using a
 If you already use BigQuery, the BigQuery Storage api that we provide will look very familiar as it provides the standard and the type safe api.
 Switching to this new strategy should be very straightforward.
 
-Using the type safe api is almost the same as the previous provided strategies. We just need to use @scaladoc[`@BigQueryType.fromStorage`](com.spotify.scio.bigquery.types.BigQueryType$$fromStorage). The example below retrieves all columns from a given table.
+Using the type safe api is almost the same as the previously provided strategies. We just need to use @scaladoc[@BigQueryType.fromStorage](com.spotify.scio.bigquery.types.BigQueryType$$fromStorage). The example below retrieves all columns from a given table.
 
 ```scala
 import com.spotify.scio.bigquery._
@@ -309,7 +309,7 @@ sealed trait SCollection[T] extends PCollectionWrapper[T] {
 }
 ```
 
-@scaladoc[`ClosedTap[T]`](com.spotify.scio.io.ClosedTap) encapsulates the IO @scaladoc[`Tap[T]`](com.spotify.scio.io.Tap) and it's only possible to read from it once the pipeline execution is done. This is demonstrated in the following example:
+@scaladoc[ClosedTap[T]](com.spotify.scio.io.ClosedTap) encapsulates the IO @scaladoc[Tap[T]](com.spotify.scio.io.Tap) and it's only possible to read from it once the pipeline execution is done. This is demonstrated in the following example:
 
 ```scala mdoc:reset
 import com.spotify.scio._
@@ -351,7 +351,7 @@ def scioResult(sc: ScioContext): ScioResult = sc.run().waitUntilDone(Duration.In
 
 ### Remove tensorflow methods related to schema inference
 
-In scio 0.7.0 `scio-tensorflow` saw some of it's operations being deprecated.
+In scio 0.7.0 `scio-tensorflow` saw some of its operations being deprecated.
 They are no longer available in this version and we recommend users to use TensorFlow Data Validation instead.
 
 Removed operations:
@@ -401,12 +401,12 @@ The advantage of this over the previous usage of `String` or `TableReference` is
 
 Async `DoFn`s were refactored.
 
-`AsyncLookupDoFn` was renamed to @scaladoc[`BaseAsyncLookupDoFn`](com.spotify.scio.transforms.BaseAsyncLookupDoFn) and we now have better support for `Guava`, `Java 8` and scala `Future` lookup DoFn's through the following implementations @scaladoc[`GuavaAsyncLookupDoFn`](com.spotify.scio.transforms.GuavaAsyncLookupDoFn), @scaladoc[`JavaAsyncLookupDoFn`](com.spotify.scio.transforms.JavaAsyncLookupDoFn) and @scaladoc[`ScalaAsyncLookupDoFn`](com.spotify.scio.transforms.ScalaAsyncLookupDoFn).
+`AsyncLookupDoFn` was renamed to @scaladoc[BaseAsyncLookupDoFn](com.spotify.scio.transforms.BaseAsyncLookupDoFn) and we now have better support for `Guava`, `Java 8` and Scala `Future` lookup DoFns through the following implementations: @scaladoc[GuavaAsyncLookupDoFn](com.spotify.scio.transforms.GuavaAsyncLookupDoFn), @scaladoc[JavaAsyncLookupDoFn](com.spotify.scio.transforms.JavaAsyncLookupDoFn) and @scaladoc[ScalaAsyncLookupDoFn](com.spotify.scio.transforms.ScalaAsyncLookupDoFn).
### Remove support for lisp-case CLI arguments In order to be consistent with Beam's way of passing arguments into the application and construct -@javadoc[`PipelineOptions`](org.apache.beam.sdk.options.PipelineOptions), we decided to drop support +@javadoc[PipelineOptions](org.apache.beam.sdk.options.PipelineOptions), we decided to drop support for `lisp-case` arguments. What this means is that if you were passing arguments like `--foo-bar` now you need to pass it as `--fooBar`. diff --git a/site/src/main/paradox/migrations/v0.9.0-Migration-Guide.md b/site/src/main/paradox/releases/migrations/v0.9.0-Migration-Guide.md similarity index 87% rename from site/src/main/paradox/migrations/v0.9.0-Migration-Guide.md rename to site/src/main/paradox/releases/migrations/v0.9.0-Migration-Guide.md index 2449f8c3ad..366e2ac30e 100644 --- a/site/src/main/paradox/migrations/v0.9.0-Migration-Guide.md +++ b/site/src/main/paradox/releases/migrations/v0.9.0-Migration-Guide.md @@ -29,7 +29,7 @@ The switch also adds the following benefits: Previously `Hash128[K]` only provides instances for `Int`, `Long`, `String`, `Array[Byte]`, `Array[Int]` and `Array[Long]`, while `magnolify-guava` can derive `Funnel[K]` for most common types including tuples, case classes, etc. -We also added an [`ApproxFilter`](https://spotify.github.io/scio/api/com/spotify/scio/hash/index.html) abstraction to allow extensible approximate filter implementations. [`BloomFilter`](https://spotify.github.io/scio/api/com/spotify/scio/hash/BloomFilter$.html) extends `ApproxFilter` and allows us to create filters & side inputs from `Iterable[T]` & `SCollection[T]`. The result filter instances are serializable. For example: +We also added an @scaladoc[ApproxFilter](com.spotify.scio.hash.ApproxFilter) abstraction to allow extensible approximate filter implementations. @scaladoc[BloomFilter](com.spotify.scio.hash.BloomFilter) extends `ApproxFilter` and allows us to create filters & side inputs from `Iterable[T]` & `SCollection[T]`. The result filter instances are serializable. For example: ```scala import com.spotify.scio._ @@ -52,7 +52,7 @@ val bfCoder: Coder[BloomFilter[String]] = BloomFilter.filterCoder ### BigQuery -In `scio` `0.8.0` we introduced some [deprecations](https://spotify.github.io/scio/migrations/v0.8.0-Migration-Guide.html#bigquery) and with this version, we are enforcing them. What this means is that all `BigQuery` operations should expect a `Table` type that can be created either from a table reference or spec: +In `scio` `0.8.0` we introduced some @ref[deprecations](v0.8.0-Migration-Guide.md#bigquery) and with this version, we are enforcing them. What this means is that all `BigQuery` operations should expect a `Table` type that can be created either from a table reference or spec: ```scala def tableSpecString: String = ??? diff --git a/site/src/main/paradox/releases/v0.12.0.md b/site/src/main/paradox/releases/v0.12.0.md index 6eebe08f73..26681ee520 100644 --- a/site/src/main/paradox/releases/v0.12.0.md +++ b/site/src/main/paradox/releases/v0.12.0.md @@ -1,7 +1,7 @@ # v0.12.0 Release Blog Scio 0.12.0 contains many new features, performance improvements, and a few breaking changes. -You can find the technical Migration Guide @ref:[here](../migrations/v0.12.0-Migration-Guide.md) with +You can find the technical Migration Guide @ref:[here](migrations/v0.12.0-Migration-Guide.md) with code samples and Scalafix instructions, and the full release notes are [here](https://github.com/spotify/scio/releases/tag/v0.12.0). 
## New Features @@ -63,7 +63,7 @@ import com.spotify.scio.parquet._ sc.typedParquetFile[T](path, conf = ParquetConfiguration.of("scio.parquet.read.useSplittableDoFn" -> true)) ``` -You can find more information, and other migration options, @ref:[here](../migrations/v0.12.0-Migration-Guide.md#parquet-reads). +You can find more information, and other migration options, @ref:[here](migrations/v0.12.0-Migration-Guide.md#parquet-reads). ### GRPC Lookup API Scio 0.12.0 includes a new artifact, `scio-grpc`, that provides a custom `AsyncLookupDoFn` implementation specifically for GRPC service lookups. @@ -73,7 +73,7 @@ Both unary and server-streaming lookups are supported. import com.spotify.scio.grpc._ data - .map { case (str1, str2) => ConcatRequest.newBuilder.setStr1(str1).setStr2(str2).build) + .map { case (str1, str2) => ConcatRequest.newBuilder.setStr1(str1).setStr2(str2).build } .grpcLookup[ConcatResponse, ConcatServiceFutureStub]( () => NettyChannelBuilder.forTarget(ServiceUri).usePlaintext().build(), ConcatServiceGrpc.newFutureStub, @@ -176,8 +176,8 @@ You can see a full list on the [release notes](https://github.com/spotify/scio/r Scio 0.12.0 has a few breaking changes. The most impactful changes include: -- [Pubsub read API changes](https://spotify.github.io/scio/migrations/v0.12.0-Migration-Guide.html#removal-of-com-spotify-scio-pubsub-specializations) -- [scio-extra bigquery removal](https://spotify.github.io/scio/migrations/v0.12.0-Migration-Guide.html#com-spotify-scio-extra-bigquery-removal) -- [Parquet's saveAsDynamicParquetAvroFile removed in favor of saveAsParquetAvroFile](https://spotify.github.io/scio/migrations/v0.12.0-Migration-Guide.html#parquetio-saveasdynamicparquetavrofile-saveasparquetavrofile) +- @ref[Pubsub read API changes](migrations/v0.12.0-Migration-Guide.md#removal-of-com-spotify-scio-pubsub-specializations) +- @ref[scio-extra bigquery removal](migrations/v0.12.0-Migration-Guide.md#com-spotify-scio-extra-bigquery-removal) +- @ref[Parquet's saveAsDynamicParquetAvroFile removed in favor of saveAsParquetAvroFile](migrations/v0.12.0-Migration-Guide.md#parquetio-saveasdynamicparquetavrofile-saveasparquetavrofile) -A full list of breaking changes can be found on our @ref:[Migration Guide](../migrations/v0.12.0-Migration-Guide.md). \ No newline at end of file +A full list of breaking changes can be found on our @ref:[Migration Guide](migrations/v0.12.0-Migration-Guide.md). \ No newline at end of file