Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

standardise on UTF-8 when converting strings to/from bytes #839

Merged
merged 3 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.apache.avro.generic.GenericData
import org.apache.avro.util.Utf8

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets
import java.util.UUID

trait StringEncoders:
Expand Down Expand Up @@ -37,13 +38,15 @@ object UTF8StringEncoder extends Encoder[String] :
* An [[Encoder]] for Strings that encodes as [[ByteBuffer]]s.
*/
object ByteStringEncoder extends Encoder[String] :
// NOTE(review): diff markers are not visible in this view. The single-line form below is
// presumably the removed version: getBytes with no argument encodes with the JVM default charset.
override def encode(schema: Schema): String => Any = string => ByteBuffer.wrap(string.getBytes)
// Presumably the added version: the string is encoded explicitly as UTF-8 and the resulting
// byte array is wrapped in a ByteBuffer. The schema argument is not consulted.
override def encode(schema: Schema): String => Any = string =>
ByteBuffer.wrap(string.getBytes(StandardCharsets.UTF_8))

/**
* An [[Encoder]] for Strings that encodes as [[GenericFixed]]s.
*
* The string's bytes are copied into a freshly allocated buffer of exactly
* `schema.getFixedSize` bytes, and that buffer's backing array becomes the fixed value.
*
* @throws Avro4sEncodingException if the string has more bytes than the schema's fixed size
*/
object FixedStringEncoder extends Encoder[String] :
override def encode(schema: Schema): String => Any = string =>
// NOTE(review): diff markers are not visible in this view. The next three lines are presumably
// the removed version: getBytes is called three times, each time with the JVM default charset.
if (string.getBytes.length > schema.getFixedSize)
throw new Avro4sEncodingException(s"Cannot write string with ${string.getBytes.length} bytes to fixed type of size ${schema.getFixedSize}")
GenericData.get.createFixed(null, ByteBuffer.allocate(schema.getFixedSize).put(string.getBytes).array, schema).asInstanceOf[GenericData.Fixed]
// Presumably the added version: encode once as UTF-8 and reuse the same array for the size
// check, the error message and the copy.
val bytes = string.getBytes(StandardCharsets.UTF_8)
// Reject values that do not fit; the check is on the encoded byte length, not the character count.
if (bytes.length > schema.getFixedSize)
throw new Avro4sEncodingException(s"Cannot write string with ${bytes.length} bytes to fixed type of size ${schema.getFixedSize}")
// The buffer is allocated at the full fixed size, so a shorter value only fills the leading bytes.
GenericData.get.createFixed(null, ByteBuffer.allocate(schema.getFixedSize).put(bytes).array, schema).asInstanceOf[GenericData.Fixed]
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import com.sksamuel.avro4s.AvroSchema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import java.nio.charset.StandardCharsets

/**
* Tests created from README examples
*
Expand Down Expand Up @@ -51,7 +53,7 @@ class ReadMeExamples extends AnyWordSpec with Matchers {

json shouldBe ("{\"name\":\"ennio morricone\",\"birthplace\":\"rome\",\"compositions\":[\"legend of 1900\",\"ecstasy of gold\"]}")

val in = new ByteArrayInputStream(json.getBytes("UTF-8"))
val in = new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))
val schema = AvroSchema[Composer]
val input = AvroInputStream.json[Composer].from(in).build(schema)
val result = input.iterator.toSeq
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

class StringDecoderTest extends AnyFunSuite with Matchers {

Expand Down Expand Up @@ -43,14 +44,14 @@ class StringDecoderTest extends AnyFunSuite with Matchers {
// Checks that a "str" field holding a ByteBuffer is decoded into FooString("hello").
test("decode from byte buffers to strings") {
val schema = AvroSchema[FooString]
val record = new GenericData.Record(schema)
// NOTE(review): the two put calls are the before/after pair of the diff (markers not visible);
// presumably the first, which relies on the JVM default charset, is the removed line.
record.put("str", ByteBuffer.wrap("hello".getBytes))
record.put("str", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
Decoder[FooString].decode(schema).apply(record) shouldBe FooString("hello")
}

// Checks that a "str" field holding a raw byte array is decoded into FooString("hello").
test("decode from byte arrays to strings") {
val schema = AvroSchema[FooString]
val record = new GenericData.Record(schema)
// NOTE(review): the two put calls are the before/after pair of the diff (markers not visible);
// presumably the first, which relies on the JVM default charset, is the removed line.
record.put("str", "hello".getBytes)
record.put("str", "hello".getBytes(StandardCharsets.UTF_8))
Decoder[FooString].decode(schema).apply(record) shouldBe FooString("hello")
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package com.sksamuel.avro4s.record.encoder

import java.nio.ByteBuffer
import com.sksamuel.avro4s.{AvroSchema, Encoder, SchemaFor}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

class ByteArrayEncoderTest extends AnyFunSuite with Matchers {

test("encode byte arrays as BYTES type") {
Expand Down Expand Up @@ -72,7 +74,7 @@ class ByteArrayEncoderTest extends AnyFunSuite with Matchers {
val schema = SchemaBuilder.fixed("foo").size(7)
val fixed = Encoder[Array[Byte]]
.encode(schema)
.apply("hello".getBytes)
.apply("hello".getBytes(StandardCharsets.UTF_8))
.asInstanceOf[GenericFixed]
fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
fixed.bytes().length shouldBe 7
Expand All @@ -82,7 +84,7 @@ class ByteArrayEncoderTest extends AnyFunSuite with Matchers {
val schema = SchemaBuilder.fixed("foo").size(7)
val fixed = Encoder[ByteBuffer]
.encode(schema)
.apply(ByteBuffer.wrap("hello".getBytes))
.apply(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.asInstanceOf[GenericFixed]
fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
fixed.bytes().length shouldBe 7
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream
import java.nio.charset.StandardCharsets

import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema, Encoder}
import org.apache.avro.file.CodecFactory
import org.scalatest.matchers.should.Matchers
Expand All @@ -18,24 +20,24 @@ class AvroDataOutputStreamCodecTest extends AnyWordSpec with Matchers {
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("birthplace")
new String(baos.toByteArray) should include("compositions")
baos.toString(StandardCharsets.UTF_8.name()) should include("birthplace")
baos.toString(StandardCharsets.UTF_8.name()) should include("compositions")
}

// Writes one record through a data output stream built with the deflate codec, then checks that
// the serialized bytes, read as text, contain the string "deflate".
// NOTE(review): "coded" in the test name is presumably a typo for "codec"; left as-is because it is a runtime string.
"include deflate coded in metadata when serialized with deflate" in {
val baos = new ByteArrayOutputStream()
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).withCodec(CodecFactory.deflateCodec(CodecFactory.DEFAULT_DEFLATE_LEVEL)).build()
output.write(ennio)
// The stream is closed before the buffer is inspected.
output.close()
// NOTE(review): the two assertions are the before/after pair of the diff (markers not visible);
// presumably the first, which decodes with the JVM default charset, is the removed line.
new String(baos.toByteArray) should include("deflate")
baos.toString(StandardCharsets.UTF_8.name()) should include("deflate")
}

// Writes one record through a data output stream built with the bzip2 codec, then checks that
// the serialized bytes, read as text, contain the string "bzip2".
// NOTE(review): "coded" in the test name is presumably a typo for "codec"; left as-is because it is a runtime string.
"include bzip2 coded in metadata when serialized with bzip2" in {
val baos = new ByteArrayOutputStream()
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).withCodec(CodecFactory.bzip2Codec).build()
output.write(ennio)
// The stream is closed before the buffer is inspected.
output.close()
// NOTE(review): the two assertions are the before/after pair of the diff (markers not visible);
// presumably the first, which decodes with the JVM default charset, is the removed line.
new String(baos.toByteArray) should include("bzip2")
baos.toString(StandardCharsets.UTF_8.name()) should include("bzip2")
}
}
}
Loading