Skip to content

Commit

Permalink
Add KtSoupParser, make KtSoupDocument a KtSoupElement
Browse files Browse the repository at this point in the history
  • Loading branch information
DrewCarlson committed Aug 1, 2023
1 parent c6d420a commit 1074e54
Show file tree
Hide file tree
Showing 16 changed files with 163 additions and 108 deletions.
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,7 @@

A multiplatform HTML5 parsing library built on [Lexbor](https://github.com/lexbor/lexbor), [Jsoup](https://jsoup.org/), and [node-html-parser](https://github.com/taoqf/node-html-parser).

TODO:
- Finalize initial API surface
- Add documentation and publishing workflows
- Extra: Add all-in-one fetch and parse API using Ktor

### Example
## Usage

```kotlin
val documentString = """
Expand All @@ -22,8 +17,7 @@ val documentString = """
</html>
"""

val document = KtSoupDocument()
document.parse(documentString)
val document = KtSoupParser.parse(documentString)
document.use { document ->
val div = document.getElementById("test")
println(div.textContent()) // output: Hello World
Expand Down
13 changes: 9 additions & 4 deletions ktsoup-core/api/ktsoup-core.api
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
public final class ktsoup/KtSoupDocument {
public fun <init> ()V
public final class ktsoup/KtSoupDocument : ktsoup/KtSoupElement {
public final fun body ()Lktsoup/KtSoupElement;
public final fun close ()V
public final fun getElementById (Ljava/lang/String;)Lktsoup/KtSoupElement;
public final fun getElementsByClass (Ljava/lang/String;)Ljava/util/List;
public final fun getElementsByTagName (Ljava/lang/String;)Ljava/util/List;
public final fun head ()Lktsoup/KtSoupElement;
public final fun parse (Ljava/lang/String;)Z
public final fun title ()Ljava/lang/String;
public final fun use (Lkotlin/jvm/functions/Function1;)Ljava/lang/Object;
}

public final class ktsoup/KtSoupElement : ktsoup/KtSoupNode {
public class ktsoup/KtSoupElement : ktsoup/KtSoupNode {
public final fun attr (Ljava/lang/String;)Ljava/lang/String;
public final fun attrs ()Ljava/util/Map;
public final fun className ()Ljava/lang/String;
public final fun id ()Ljava/lang/String;
public final fun querySelector (Ljava/lang/String;)Lktsoup/KtSoupElement;
public final fun querySelectorAll (Ljava/lang/String;)Ljava/util/List;
public final fun tagName ()Ljava/lang/String;
}

Expand Down Expand Up @@ -52,6 +52,11 @@ public final class ktsoup/KtSoupNodeType : java/lang/Enum {
public static fun values ()[Lktsoup/KtSoupNodeType;
}

public final class ktsoup/KtSoupParser {
public static final field INSTANCE Lktsoup/KtSoupParser;
public final fun parse (Ljava/lang/String;)Lktsoup/KtSoupDocument;
}

public final class ktsoup/KtSoupText : ktsoup/KtSoupNode {
}

31 changes: 2 additions & 29 deletions ktsoup-core/src/commonMain/kotlin/ktsoup/KtSoupDocument.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,12 @@
*/
package ktsoup

internal const val ERROR_CALL_PARSE_FIRST = "`parse(html)` must be called before using a KtSoupDocument."
/** Message for the [IllegalStateException] raised when a closed [KtSoupDocument] is accessed. */
internal const val ERROR_DOCUMENT_CLOSED = "The KtSoupDocument was closed and cannot be accessed again."

/**
* Represents a DOM document providing APIs to query and manipulate the document.
*
* **Important:** [parse] must be called before using any other methods in this class.
*/
public expect class KtSoupDocument() {

/**
* Parse the given [html] document.
*
* [parse] must be called first before using any other method or an
* [IllegalStateException] will be thrown.
*
* @return True if the document was parsed or false if parsing failed.
*/
public fun parse(html: String): Boolean
public expect class KtSoupDocument : KtSoupElement {

/**
* Get the document's title or an empty string if no title is found.
Expand All @@ -58,21 +46,6 @@ public expect class KtSoupDocument() {
*/
public fun head(): KtSoupElement?

/**
* Get the first [KtSoupElement] matching the css [selector] or null
* if there are no matches.
*
* @return The matching [KtSoupElement] or null.
*/
public fun querySelector(selector: String): KtSoupElement?

/**
* Get all [KtSoupElement]s matching the css [selector].
*
* @return A list of elements matching the [selector].
*/
public fun querySelectorAll(selector: String): List<KtSoupElement>

/**
* Get the first element with an `id` attribute matching [id] or null
* if there are no matching elements.
Expand Down
2 changes: 1 addition & 1 deletion ktsoup-core/src/commonMain/kotlin/ktsoup/KtSoupElement.kt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package ktsoup
/**
* A DOM element from a [KtSoupDocument].
*/
public expect class KtSoupElement : KtSoupNode {
public expect open class KtSoupElement : KtSoupNode {

/**
* Get the `id` attribute value or null if there is no `id` attribute.
Expand Down
30 changes: 30 additions & 0 deletions ktsoup-core/src/commonMain/kotlin/ktsoup/KtSoupParser.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
* KtSoup
* Copyright (C) 2023 Drew Carlson
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ktsoup

/**
 * The primary entry point for KtSoup.
 *
 * Use [parse] to turn an HTML string into a [KtSoupDocument], which can then
 * be queried and manipulated. The returned document should be released with
 * [KtSoupDocument.close] (or scoped with [KtSoupDocument.use]) once it is no
 * longer needed; accessing a closed document fails with an
 * [IllegalStateException].
 */
public expect object KtSoupParser {
/**
 * Parse the given [html] document.
 *
 * NOTE(review): failure behavior is platform specific. The JS implementation
 * throws an [IllegalStateException] when its underlying parser returns no
 * document; confirm the behavior of the other targets before relying on it.
 *
 * @param html The HTML source text to parse.
 * @return The parsed document as a [KtSoupDocument].
 */
public fun parse(html: String): KtSoupDocument
}
12 changes: 6 additions & 6 deletions ktsoup-core/src/commonTest/kotlin/ktsoup/KtSoupDocumentTests.kt
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ import kotlin.test.*
class KtSoupDocumentTests {

@Test
fun testDocument_Parse() {
val document = KtSoupDocument()
assertTrue(document.parse(SIMPLE_DOCUMENT))
fun testDocument_ParseAndClose() {
val document = KtSoupParser.parse(SIMPLE_DOCUMENT)
document.close()
assertFailsWith<IllegalStateException> {
document.title()
}
}

@Test
Expand Down Expand Up @@ -91,9 +93,7 @@ class KtSoupDocumentTests {
}

private fun withDocument(html: String, testBody: (document: KtSoupDocument) -> Unit) {
val document = KtSoupDocument()
assertTrue(document.parse(html))
document.use(testBody)
KtSoupParser.parse(html).use(testBody)
}

@Test
Expand Down
24 changes: 5 additions & 19 deletions ktsoup-core/src/jsMain/kotlin/ktsoup/KtSoupDocument.js.kt
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,10 @@
package ktsoup

import ktsoup.nodehtmlparser.HTMLElement
import ktsoup.nodehtmlparser.HTMLParser

public actual class KtSoupDocument actual constructor() {
private var document: HTMLElement? = null
public actual fun parse(html: String): Boolean {
document = HTMLParser.parse(html)
return document != null
}
public actual class KtSoupDocument internal constructor(
private var document: HTMLElement?,
) : KtSoupElement(document!!) {

public actual fun title(): String {
return getElementsByTagName("title").firstOrNull()?.textContent().orEmpty()
Expand All @@ -38,22 +34,12 @@ public actual class KtSoupDocument actual constructor() {
return getElementsByTagName("head").firstOrNull()
}

public actual fun querySelector(selector: String): KtSoupElement? {
return checkDocument().querySelector(selector)?.wrap() as? KtSoupElement
}

public actual fun querySelectorAll(selector: String): List<KtSoupElement> {
return checkDocument().querySelectorAll(selector)
.mapNotNull { it.wrap() as? KtSoupElement }
}

public actual fun getElementById(id: String): KtSoupElement? {
return checkDocument().getElementById(id)?.let { KtSoupElement(it) }
}

public actual fun getElementsByClass(className: String): List<KtSoupElement> {
return checkDocument().querySelectorAll(".$className")
.map { KtSoupElement(it.unsafeCast<HTMLElement>()) }
return querySelectorAll(".$className")
}

public actual fun getElementsByTagName(tagName: String): List<KtSoupElement> {
Expand All @@ -73,6 +59,6 @@ public actual class KtSoupDocument actual constructor() {
}

private fun checkDocument(): HTMLElement {
return checkNotNull(document) { ERROR_CALL_PARSE_FIRST }
return checkNotNull(document) { ERROR_DOCUMENT_CLOSED }
}
}
2 changes: 1 addition & 1 deletion ktsoup-core/src/jsMain/kotlin/ktsoup/KtSoupElement.js.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package ktsoup

import ktsoup.nodehtmlparser.HTMLElement

public actual class KtSoupElement internal constructor(
public actual open class KtSoupElement internal constructor(
private val element: HTMLElement,
) : KtSoupNode(element) {
public actual fun id(): String? {
Expand Down
29 changes: 29 additions & 0 deletions ktsoup-core/src/jsMain/kotlin/ktsoup/KtSoupParser.js.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
* KtSoup
* Copyright (C) 2023 Drew Carlson
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ktsoup

import ktsoup.nodehtmlparser.HTMLParser

/**
 * JavaScript implementation of [KtSoupParser], delegating to [HTMLParser].
 */
public actual object KtSoupParser {

    /**
     * Parses [html] with [HTMLParser] and wraps the resulting root node in a
     * [KtSoupDocument].
     *
     * @throws IllegalStateException if the parser yields no root node.
     */
    public actual fun parse(html: String): KtSoupDocument {
        val root = HTMLParser.parse(html)
        // Fail fast with a descriptive message rather than handing out an unusable document.
        checkNotNull(root) { "Failed to parse HTML document" }
        return KtSoupDocument(root)
    }
}
22 changes: 4 additions & 18 deletions ktsoup-core/src/jvmMain/kotlin/ktsoup/KtSoupDocument.jvm.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,11 @@
*/
package ktsoup

import org.jsoup.Jsoup
import org.jsoup.nodes.Document

public actual class KtSoupDocument {

private var document: Document? = null

public actual fun parse(html: String): Boolean {
document = Jsoup.parse(html)
return true
}
public actual class KtSoupDocument internal constructor(
private var document: Document?,
) : KtSoupElement(document!!) {

public actual fun title(): String {
return checkDocument().title()
Expand All @@ -40,14 +34,6 @@ public actual class KtSoupDocument {
return KtSoupElement(checkDocument().head())
}

public actual fun querySelector(selector: String): KtSoupElement? {
return checkDocument().selectFirst(selector)?.let { KtSoupElement(it) }
}

public actual fun querySelectorAll(selector: String): List<KtSoupElement> {
return checkDocument().select(selector).map { KtSoupElement(it) }
}

public actual fun getElementById(id: String): KtSoupElement? {
return checkDocument().getElementById(id)?.let { KtSoupElement(it) }
}
Expand All @@ -73,6 +59,6 @@ public actual class KtSoupDocument {
}

private fun checkDocument(): Document {
return checkNotNull(document) { ERROR_CALL_PARSE_FIRST }
return checkNotNull(document) { ERROR_DOCUMENT_CLOSED }
}
}
2 changes: 1 addition & 1 deletion ktsoup-core/src/jvmMain/kotlin/ktsoup/KtSoupElement.jvm.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package ktsoup

import org.jsoup.nodes.Element

public actual class KtSoupElement internal constructor(
public actual open class KtSoupElement internal constructor(
private val element: Element,
) : KtSoupNode(element) {
public actual fun id(): String? {
Expand Down
26 changes: 26 additions & 0 deletions ktsoup-core/src/jvmMain/kotlin/ktsoup/KtSoupParser.jvm.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* KtSoup
* Copyright (C) 2023 Drew Carlson
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ktsoup

import org.jsoup.Jsoup

/**
 * JVM implementation of [KtSoupParser], delegating to [Jsoup].
 */
public actual object KtSoupParser {

    /**
     * Parses [html] with [Jsoup.parse] and wraps the resulting Jsoup document
     * in a [KtSoupDocument].
     */
    public actual fun parse(html: String): KtSoupDocument =
        KtSoupDocument(Jsoup.parse(html))
}
24 changes: 5 additions & 19 deletions ktsoup-core/src/nativeMain/kotlin/ktsoup/KtSoupDocument.native.kt
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,14 @@ import platform.posix.size_tVar

private const val START_LIST_SIZE = 128uL

public actual class KtSoupDocument {

private var documentPointer: CPointer<lxb_html_document_t>? = null

public actual fun parse(html: String): Boolean = memScoped {
documentPointer = lxb_html_document_create()
lxb_html_document_parse(
checkDocument(),
html.cstr.ptr.reinterpret(),
html.length.convert(),
) == LXB_STATUS_OK
}
public actual class KtSoupDocument internal constructor(
private var documentPointer: CPointer<lxb_html_document_t>?,
) : KtSoupElement(documentPointer!!.reinterpret()) {

/**
 * Destroys the underlying native document and clears the stored pointer.
 *
 * Returns immediately when the pointer is already null, so calling this
 * more than once is harmless.
 */
public actual fun close() {
    val pointer = documentPointer ?: return
    lxb_html_document_destroy(pointer)
    // Clearing the pointer makes later checkDocument() calls fail instead of touching freed memory.
    documentPointer = null
}

public actual fun title(): String = memScoped {
Expand All @@ -57,12 +49,6 @@ public actual class KtSoupDocument {
?.let { KtSoupElement(it.reinterpret()) }
}

public actual fun querySelector(selector: String): KtSoupElement? =
querySelectorAll(checkDocument().reinterpret(), selector, single = true).firstOrNull()

public actual fun querySelectorAll(selector: String): List<KtSoupElement> =
querySelectorAll(checkDocument().reinterpret(), selector, single = false)

public actual fun getElementById(id: String): KtSoupElement? = memScoped {
val idQuery = id.cstr
val attrQuery = "id".cstr
Expand Down Expand Up @@ -144,6 +130,6 @@ public actual class KtSoupDocument {
}

private fun checkDocument(): CPointer<lxb_html_document_t> {
return checkNotNull(documentPointer) { ERROR_CALL_PARSE_FIRST }
return checkNotNull(documentPointer) { ERROR_DOCUMENT_CLOSED }
}
}
Loading

0 comments on commit 1074e54

Please sign in to comment.