Refactor BQ to expose all beam's configurations

RustedBones committed Aug 19, 2024 · 1 parent 27fd3ca · commit d64e382

Showing 40 changed files with 832 additions and 1,141 deletions.
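In short, the refactor collapses the old Table.Spec / Table.Ref wrappers into a single Table(...) constructor that accepts spec strings and TableReference values alike, and routes format and read options through the new BigQueryIO.Format and Table parameters. A minimal before/after sketch, assuming a standard Scio project; the table spec and output path are placeholders:

import com.spotify.scio.ContextAndArgs
import com.spotify.scio.bigquery._

object TableApiMigration {
  def main(cmdlineArgs: Array[String]): Unit = {
    val (sc, _) = ContextAndArgs(cmdlineArgs)

    // Before: sc.bigQueryTable(Table.Spec("my-project:my_dataset.my_table"))
    // After: one Table(...) constructor covers specs and references alike.
    sc.bigQueryTable(Table("my-project:my_dataset.my_table"))
      // TableRow is a Map-like GenericJson; extract a field for illustration.
      .map(row => row.get("word").toString)
      .saveAsTextFile("gs://my-bucket/words") // placeholder output path

    sc.run()
  }
}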
@@ -154,7 +154,7 @@ class BigQueryClientIT extends AnyFlatSpec with Matchers {

"TableService.getRows" should "work" in {
val rows =
-      bq.tables.rows(Table.Spec("bigquery-public-data:samples.shakespeare")).take(10).toList
+      bq.tables.rows(Table("bigquery-public-data:samples.shakespeare")).take(10).toList
val columns = Set("word", "word_count", "corpus", "corpus_date")
all(rows.map(_.keySet().asScala)) shouldBe columns
}
@@ -45,7 +45,7 @@ class BigQueryIOIT extends PipelineSpec {

"Select" should "read typed values from a SQL query" in
runWithRealContext(options) { sc =>
-      val scoll = sc.read(BigQueryTyped[ShakespeareFromQuery])
+      val scoll = sc.typedBigQueryStorage[ShakespeareFromQuery]()
scoll should haveSize(10)
scoll should satisfy[ShakespeareFromQuery] {
_.forall(_.getClass == classOf[ShakespeareFromQuery])
@@ -54,7 +54,7 @@

"TableRef" should "read typed values from table" in
runWithRealContext(options) { sc =>
-      val scoll = sc.read(BigQueryTyped[ShakespeareFromTable])
+      val scoll = sc.typedBigQueryStorage[ShakespeareFromTable]()
scoll.take(10) should haveSize(10)
scoll should satisfy[ShakespeareFromTable] {
_.forall(_.getClass == classOf[ShakespeareFromTable])
@@ -20,7 +20,6 @@ package com.spotify.scio.bigquery
import com.google.protobuf.ByteString
import com.spotify.scio._
import com.spotify.scio.avro._
-import com.spotify.scio.bigquery.BigQueryTypedTable.Format
import com.spotify.scio.bigquery.client.BigQuery
import com.spotify.scio.testing._
import magnolify.scalacheck.auto._
@@ -69,7 +68,7 @@ object TypedBigQueryIT {
val now = Instant.now().toString(TIME_FORMATTER)
val spec =
s"data-integration-test:bigquery_avro_it.$name${now}_${Random.nextInt(Int.MaxValue)}"
-    Table.Spec(spec)
+    Table(spec)
}
private val tableRowTable = table("records_tablerow")
private val avroTable = table("records_avro")
@@ -101,37 +100,25 @@ class TypedBigQueryIT extends PipelineSpec with BeforeAndAfterAll {
BigQuery.defaultInstance().tables.delete(avroLogicalTypeTable.ref)
}

"TypedBigQuery" should "read records" in {
"typedBigQuery" should "read records" in {
val sc = ScioContext(options)
sc.typedBigQuery[Record](tableRowTable) should containInAnyOrder(records)
sc.run()
}

it should "convert to avro format" in {
"bigQueryTableFormat" should "read TableRow records" in {
val sc = ScioContext(options)
-    implicit val coder = avroGenericRecordCoder(Record.avroSchema)
-    sc.typedBigQuery[Record](tableRowTable)
-      .map(Record.toAvro)
-      .map(Record.fromAvro) should containInAnyOrder(
-      records
-    )
+    val format = BigQueryIO.Format.Default(BigQueryType[Record])
+    val data = sc.bigQueryTableFormat(tableRowTable, format)
+    data should containInAnyOrder(records)
sc.run()
}

"BigQueryTypedTable" should "read TableRow records" in {
it should "read GenericRecord records" in {
val sc = ScioContext(options)
sc
.bigQueryTable(tableRowTable)
.map(Record.fromTableRow) should containInAnyOrder(records)
sc.run()
}

it should "read GenericRecord recors" in {
val sc = ScioContext(options)
implicit val coder = avroGenericRecordCoder(Record.avroSchema)
sc
.bigQueryTable(tableRowTable, Format.GenericRecord)
.map(Record.fromAvro) should containInAnyOrder(records)
val format = BigQueryIO.Format.Avro(BigQueryType[Record])
val data = sc.bigQueryTableFormat(tableRowTable, format)
data should containInAnyOrder(records)
sc.run()
}

@@ -157,7 +144,7 @@ class TypedBigQueryIT extends PipelineSpec with BeforeAndAfterAll {
|}
""".stripMargin)
val tap = sc
-      .bigQueryTable(tableRowTable, Format.GenericRecord)
+      .bigQueryTableFormat(tableRowTable, BigQueryIO.Format.Avro())
.saveAsBigQueryTable(avroTable, schema = schema, createDisposition = CREATE_IF_NEEDED)

val result = sc.run().waitUntilDone()
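The two tests above replace the removed BigQueryTypedTable.Format with the new BigQueryIO.Format, which bundles the BigQueryType together with the wire format. A hedged sketch of both variants, assuming an annotation-derived case class and a placeholder table spec:

import com.spotify.scio.ScioContext
import com.spotify.scio.bigquery._
import com.spotify.scio.bigquery.types.BigQueryType

object FormatSketch {
  @BigQueryType.toTable
  case class Row(word: String, wordCount: Long)

  def read(sc: ScioContext): Unit = {
    val table = Table("my-project:my_dataset.my_table") // placeholder spec

    // Default: rows are read as TableRow JSON and decoded via the BigQueryType.
    sc.bigQueryTableFormat(table, BigQueryIO.Format.Default(BigQueryType[Row])).debug()

    // Avro: rows are read as Avro GenericRecord, as in the tests above.
    sc.bigQueryTableFormat(table, BigQueryIO.Format.Avro(BigQueryType[Row])).debug()
  }
}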
@@ -155,8 +155,10 @@ class BigQueryStorageIT extends AnyFlatSpec with Matchers {
val (sc, _) = ContextAndArgs(
Array("--project=data-integration-test", "--tempLocation=gs://data-integration-test-eu/temp")
)
+    val bqt = BigQueryType[NestedWithRestriction]
+    val source = Table(bqt.table.get, "required.int < 3")
    val p = sc
-      .typedBigQueryStorage[NestedWithRestriction](rowRestriction = "required.int < 3")
+      .typedBigQueryStorage[NestedWithRestriction](source)
.map { r =>
val (req, opt, rep) = (r.required, r.optional.get, r.repeated.head)
(req.int, req.string, opt.int, opt.string, rep.int, rep.string)
@@ -172,7 +174,7 @@
Array("--project=data-integration-test", "--tempLocation=gs://data-integration-test-eu/temp")
)
val p = sc
-      .typedBigQuery[NestedWithAll](Table.Spec(NestedWithAll.table.format("nested")))
+      .typedBigQuery[NestedWithAll](Table(NestedWithAll.table.format("nested")))
.map(r => (r.required.int, r.required.string, r.optional.get.int))
.internal
PAssert.that(p).containsInAnyOrder(expected)
@@ -243,7 +245,7 @@ class BigQueryStorageIT extends AnyFlatSpec with Matchers {
Array("--project=data-integration-test", "--tempLocation=gs://data-integration-test-eu/temp")
)
val p = sc
-      .typedBigQueryStorage[ToTableRequired](Table.Spec("data-integration-test:storage.required"))
+      .typedBigQueryStorage[ToTableRequired](Table("data-integration-test:storage.required"))
.internal
PAssert.that(p).containsInAnyOrder(expected)
sc.run()
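Note the shape of the storage-read change above: the row restriction moves off the read call and onto the Table source itself. A small sketch under the same assumptions; the table spec is a placeholder and the annotated class is hypothetical:

import com.spotify.scio.ScioContext
import com.spotify.scio.bigquery._
import com.spotify.scio.bigquery.types.BigQueryType

object StorageReadSketch {
  @BigQueryType.fromStorage("my-project:my_dataset.my_table") // placeholder spec
  class MyRow

  def read(sc: ScioContext): Unit = {
    val bqt = BigQueryType[MyRow]
    // The filter now travels with the source definition.
    val source = Table(bqt.table.get, "required.int < 3")
    sc.typedBigQueryStorage[MyRow](source).debug()
  }
}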
@@ -187,10 +187,9 @@ class BigQueryTypeIT extends AnyFlatSpec with Matchers {
tableReference.setProjectId("data-integration-test")
tableReference.setDatasetId("partition_a")
tableReference.setTableId("table_$LATEST")
-    Table.Ref(tableReference).latest().ref.getTableId shouldBe "table_20170302"
+    Table(tableReference).latest().ref.getTableId shouldBe "table_20170302"

-    Table
-      .Spec("data-integration-test:partition_a.table_$LATEST")
+    Table("data-integration-test:partition_a.table_$LATEST")
.latest()
.ref
.getTableId shouldBe "table_20170302"
@@ -210,7 +209,7 @@
val bqt = BigQueryType[FromTableT]
bqt.isQuery shouldBe false
bqt.isTable shouldBe true
-    bqt.query shouldBe None
+    bqt.queryRaw shouldBe None
bqt.table shouldBe Some("bigquery-public-data:samples.shakespeare")
val fields = bqt.schema.getFields.asScala
fields.size shouldBe 4
23 changes: 23 additions & 0 deletions scio-core/src/main/scala/com/spotify/scio/ScioContext.scala
@@ -51,6 +51,8 @@ import scala.reflect.ClassTag
import scala.util.control.NoStackTrace
import scala.util.{Failure, Success, Try}
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions
+import org.apache.beam.sdk.transforms.errorhandling.BadRecord
+import org.apache.beam.sdk.transforms.errorhandling.ErrorHandler.BadRecordErrorHandler

/** Runner specific context. */
trait RunnerContext {
@@ -851,6 +853,27 @@ class ScioContext private[scio] (
this.applyTransform(Create.timestamped(v.asJava).withCoder(coder))
}

+  // =======================================================================
+  // Error handler
+  // =======================================================================
+  def registerBadRecordErrorHandler[O <: POutput](
+    sinkTransform: PTransform[PCollection[BadRecord], O]
+  ): BadRecordErrorHandler[O] =
+    pipeline.registerBadRecordErrorHandler(sinkTransform)
+
+  def badRecordErrorHandler(): (BadRecordErrorHandler[PCollectionTuple], SCollection[BadRecord]) = {
+    val tag = new TupleTag[BadRecord]()
+    val sideOutput = PCollectionTuple.empty(pipeline)
+    val sinkTransform = new PTransform[PCollection[BadRecord], PCollectionTuple] {
+      override def expand(input: PCollection[BadRecord]): PCollectionTuple =
+        sideOutput.and(tag, input)
+    }
+
+    val handler = pipeline.registerBadRecordErrorHandler(sinkTransform)
+    val errorOutput = wrap(sideOutput.get(tag))
+    (handler, errorOutput)
+  }
+
// =======================================================================
// Metrics
// =======================================================================
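These additions are the new public hooks for Beam's bad-record error handling: registerBadRecordErrorHandler forwards an arbitrary sink transform, and badRecordErrorHandler() exposes failed records as a plain SCollection[BadRecord] (the BeamTypeCoders change below supplies the matching coder). A hedged usage sketch; how handler is wired into individual IOs is left out because it depends on which transforms accept one:

import com.spotify.scio.ScioContext

object ErrorHandlerSketch {
  def run(sc: ScioContext): Unit = {
    val (handler, badRecords) = sc.badRecordErrorHandler()

    // ... build the main pipeline here, passing `handler` to transforms
    // that can route their failures to a bad-record error handler ...

    // Failed records become ordinary elements that can be inspected or saved.
    badRecords.map(_.toString).debug()

    // Beam requires error handlers to be closed before the pipeline runs.
    handler.close()
    sc.run()
  }
}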
@@ -20,6 +20,7 @@ package com.spotify.scio.coders.instances
import com.google.api.client.json.GenericJson
import com.google.api.client.json.JsonObjectParser
import com.google.api.client.json.gson.GsonFactory
+import com.spotify.scio.ScioContext
import com.spotify.scio.coders.{Coder, CoderGrammar}
import com.spotify.scio.util.ScioUtil

@@ -29,6 +30,7 @@ import org.apache.beam.sdk.io.FileIO.ReadableFile
import org.apache.beam.sdk.io.fs.{MatchResult, MetadataCoderV2, ResourceId, ResourceIdCoder}
import org.apache.beam.sdk.io.ReadableFileCoder
import org.apache.beam.sdk.schemas.{Schema => BSchema}
+import org.apache.beam.sdk.transforms.errorhandling.BadRecord
import org.apache.beam.sdk.transforms.windowing.{
BoundedWindow,
GlobalWindow,
@@ -66,6 +68,8 @@ trait BeamTypeCoders extends CoderGrammar {
str => DefaultJsonObjectParser.parseAndClose(new StringReader(str), ScioUtil.classOf[T]),
DefaultJsonObjectParser.getJsonFactory().toString(_)
)

+  def badRecordCoder(sc: ScioContext): Coder[BadRecord] = beam(BadRecord.getCoder(sc.pipeline))
}

private[coders] object BeamTypeCoders extends BeamTypeCoders {
@@ -145,7 +145,7 @@ object AutoComplete {
if (outputToBigqueryTable) {
tags
.map(kv => Record(kv._1, kv._2.map(p => Tag(p._1, p._2)).toList))
.saveAsTypedBigQueryTable(Table.Spec(args("output")))
.saveAsTypedBigQueryTable(Table(args("output")))
}
if (outputToDatastore) {
val kind = args.getOrElse("kind", "autocomplete-demo")
@@ -49,7 +49,7 @@ object StreamingWordExtract {
.flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
.map(_.toUpperCase)
.map(s => TableRow("string_field" -> s))
.saveAsBigQueryTable(Table.Spec(args("output")), schema)
.saveAsBigQueryTable(Table(args("output")), schema)

val result = sc.run()
exampleUtils.waitToFinish(result.pipelineResult)
@@ -126,7 +126,7 @@ object TrafficMaxLaneFlow {
ts
)
}
.saveAsTypedBigQueryTable(Table.Spec(args("output")))
.saveAsTypedBigQueryTable(Table(args("output")))

val result = sc.run()
exampleUtils.waitToFinish(result.pipelineResult)
@@ -111,7 +111,7 @@ object TrafficRoutes {
.map { case (r, ts) =>
Record(r.route, r.avgSpeed, r.slowdownEvent, ts)
}
.saveAsTypedBigQueryTable(Table.Spec(args("output")))
.saveAsTypedBigQueryTable(Table(args("output")))

val result = sc.run()
exampleUtils.waitToFinish(result.pipelineResult)
@@ -113,7 +113,7 @@ object GameStats {
// Done using windowing information, convert back to regular `SCollection`
.toSCollection
// Save to the BigQuery table defined by "output" in the arguments passed in + "_team" suffix
.saveAsTypedBigQueryTable(Table.Spec(args("output") + "_team"))
.saveAsTypedBigQueryTable(Table(args("output") + "_team"))

userEvents
// Window over a variable length of time - sessions end after sessionGap minutes no activity
@@ -141,7 +141,7 @@ object GameStats {
AvgSessionLength(mean, fmt.print(w.start()))
}
// Save to the BigQuery table defined by "output" + "_sessions" suffix
.saveAsTypedBigQueryTable(Table.Spec(args("output") + "_sessions"))
.saveAsTypedBigQueryTable(Table(args("output") + "_sessions"))

// Execute the pipeline
val result = sc.run()
@@ -91,7 +91,7 @@ object HourlyTeamScore {
TeamScoreSums(team, score, start)
}
// Save to the BigQuery table defined by "output" in the arguments passed in
.saveAsTypedBigQueryTable(Table.Spec(args("output")))
.saveAsTypedBigQueryTable(Table(args("output")))

// Execute the pipeline
sc.run()
@@ -96,7 +96,7 @@ object LeaderBoard {
// Done with windowing information, convert back to regular `SCollection`
.toSCollection
// Save to the BigQuery table defined by "output" in the arguments passed in + "_team" suffix
.saveAsTypedBigQueryTable(Table.Spec(args("output") + "_team"))
.saveAsTypedBigQueryTable(Table(args("output") + "_team"))

gameEvents
// Use a global window for unbounded data, which updates calculation every 10 minutes,
@@ -126,7 +126,7 @@ object LeaderBoard {
// Map summed results from tuples into `UserScoreSums` case class, so we can save to BQ
.map(kv => UserScoreSums(kv._1, kv._2, fmt.print(Instant.now())))
// Save to the BigQuery table defined by "output" in the arguments passed in + "_user" suffix
.saveAsTypedBigQueryTable(Table.Spec(args("output") + "_user"))
.saveAsTypedBigQueryTable(Table(args("output") + "_user"))

// Execute the pipeline
val result = sc.run()
@@ -62,7 +62,7 @@ object UserScore {
// Map summed results from tuples into `UserScoreSums` case class, so we can save to BQ
.map(UserScoreSums.tupled)
// Save to the BigQuery table defined by "output" in the arguments passed in
.saveAsTypedBigQueryTable(Table.Spec(args("output")))
.saveAsTypedBigQueryTable(Table(args("output")))

// Execute the pipeline
sc.run()
@@ -45,7 +45,7 @@ object BigQueryTornadoes {
)

// Open a BigQuery table as a `SCollection[TableRow]`
val table = Table.Spec(args.getOrElse("input", ExampleData.WEATHER_SAMPLES_TABLE))
val table = Table(args.getOrElse("input", ExampleData.WEATHER_SAMPLES_TABLE))
val resultTap = sc
.bigQueryTable(table)
// Extract months with tornadoes
@@ -55,7 +55,7 @@
// Map `(Long, Long)` tuples into result `TableRow`s
.map(kv => TableRow("month" -> kv._1, "tornado_count" -> kv._2))
// Save result as a BigQuery table
.saveAsBigQueryTable(Table.Spec(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)
.saveAsBigQueryTable(Table(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)

// Access the loaded tables
resultTap
@@ -47,7 +47,7 @@ object CombinePerKeyExamples {
)

// Open a BigQuery table as a `SCollection[TableRow]`
val table = Table.Spec(args.getOrElse("input", ExampleData.SHAKESPEARE_TABLE))
val table = Table(args.getOrElse("input", ExampleData.SHAKESPEARE_TABLE))
sc.bigQueryTable(table)
// Extract words and corresponding play names
.flatMap { row =>
@@ -64,7 +64,7 @@
// Map `(String, String)` tuples into result `TableRow`s
.map(kv => TableRow("word" -> kv._1, "all_plays" -> kv._2))
// Save result as a BigQuery table
.saveAsBigQueryTable(Table.Spec(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)
.saveAsBigQueryTable(Table(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)

// Execute the pipeline
sc.run()
@@ -46,7 +46,7 @@ object DistinctByKeyExample {
)

// Open a BigQuery table as a `SCollection[TableRow]`
val table = Table.Spec(args.getOrElse("input", ExampleData.SHAKESPEARE_TABLE))
val table = Table(args.getOrElse("input", ExampleData.SHAKESPEARE_TABLE))
sc.bigQueryTable(table)
// Extract words and corresponding play names
.flatMap { row =>
@@ -59,7 +59,7 @@
// Map `(String, String)` tuples into result `TableRow`s
.map(kv => TableRow("word" -> kv._1, "reference_play" -> kv._2))
// Save result as a BigQuery table
.saveAsBigQueryTable(Table.Spec(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)
.saveAsBigQueryTable(Table(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)

// Execute the pipeline
sc.run()
@@ -51,7 +51,7 @@ object FilterExamples {
val monthFilter = args.int("monthFilter", 7)

// Open BigQuery table as a `SCollection[TableRow]`
val table = Table.Spec(args.getOrElse("input", ExampleData.WEATHER_SAMPLES_TABLE))
val table = Table(args.getOrElse("input", ExampleData.WEATHER_SAMPLES_TABLE))
val pipe = sc
.bigQueryTable(table)
// Map `TableRow`s into `Record`s
@@ -81,7 +81,7 @@ object FilterExamples {
TableRow("year" -> r.year, "month" -> r.month, "day" -> r.day, "mean_temp" -> r.meanTemp)
}
// Save result as a BigQuery table
.saveAsBigQueryTable(Table.Spec(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)
.saveAsBigQueryTable(Table(args("output")), schema, WRITE_TRUNCATE, CREATE_IF_NEEDED)

// Execute the pipeline
sc.run()