Skip to content

Commit

Permalink
improved url extract
Browse files Browse the repository at this point in the history
  • Loading branch information
jakopako committed Feb 11, 2024
1 parent 25a282e commit 7819528
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
10 changes: 10 additions & 0 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,16 @@ func getURLString(e *ElementLocation, s *goquery.Selection, baseURL string) (str
} else if strings.HasPrefix(urlVal, "/") {
baseURL := fmt.Sprintf("%s://%s", u.Scheme, u.Host)
urlRes = fmt.Sprintf("%s%s", baseURL, urlVal)
} else if strings.HasPrefix(urlVal, "..") {
partsUrlVal := strings.Split(urlVal, "/")
partsPath := strings.Split(u.Path, "/")
i := 0
for ; i < len(partsUrlVal); i++ {
if partsUrlVal[i] != ".." {
break
}
}
urlRes = fmt.Sprintf("%s://%s%s/%s", u.Scheme, u.Host, strings.Join(partsPath[:len(partsPath)-i-1], "/"), strings.Join(partsUrlVal[i:], "/"))
} else {
idx := strings.LastIndex(u.Path, "/")
if idx > 0 {
Expand Down
36 changes: 36 additions & 0 deletions scraper/scraper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ const (
<span>29.02.</span><span>Heinz Rudolf Kunze &amp; Verstärkung
&#8211; ABGESAGT</span> </a>
</h2>`
htmlString6 = `
<h2>
<a href="../site/event/id/165"
title="Heinz Rudolf Kunze &amp; Verstärkung &#8211; ABGESAGT">
<span>29.02.</span><span>Heinz Rudolf Kunze &amp; Verstärkung
&#8211; ABGESAGT</span> </a>
</h2>`
)

func TestFilterItemMatchTrue(t *testing.T) {
Expand Down Expand Up @@ -480,6 +487,35 @@ func TestExtractFieldUrlFile(t *testing.T) {
}
}

// TestExtractFieldUrlParentDir checks that extractField resolves an href that
// starts with ".." against the base URL it is given: the link
// "../site/event/id/165" found on the page "http://point11.ch/site/home" is
// expected to become "http://point11.ch/site/event/id/165".
func TestExtractFieldUrlParentDir(t *testing.T) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString6))
	if err != nil {
		t.Fatalf("unexpected error while reading html string: %v", err)
	}
	f := &Field{
		Name: "url",
		Type: "url",
		ElementLocations: []ElementLocation{
			{
				Selector: "h2 > a",
			},
		},
	}
	event := map[string]interface{}{}
	err = extractField(f, event, doc.Selection, "http://point11.ch/site/home")
	if err != nil {
		// This test extracts the url field, so the message names that field.
		t.Fatalf("unexpected error while extracting the url field: %v", err)
	}
	v, ok := event["url"]
	if !ok {
		t.Fatal("event doesn't contain the expected url field")
	}
	expected := "http://point11.ch/site/event/id/165"
	if v != expected {
		t.Fatalf("expected '%s' for url but got '%s'", expected, v)
	}
}

func TestExtractFieldDate(t *testing.T) {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString))
if err != nil {
Expand Down

0 comments on commit 7819528

Please sign in to comment.