Skip to content

Commit

Permalink
allow regex for url extraction as well
Browse files Browse the repository at this point in the history
  • Loading branch information
jakopako committed Nov 18, 2023
1 parent e639a63 commit 06b6220
Showing 1 changed file with 31 additions and 34 deletions.
65 changes: 31 additions & 34 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,10 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
}
} else {
baseUrl := getBaseURL(currentPageUrl, doc)
nextPageUrl := getURLString(&c.Paginator.Location, doc.Selection, baseUrl)
nextPageUrl, err := getURLString(&c.Paginator.Location, doc.Selection, baseUrl)
if err != nil {
return false, "", nil, err
}
if nextPageUrl != "" {
nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{})
if err != nil {
Expand Down Expand Up @@ -503,7 +506,10 @@ func extractField(field *Field, event map[string]interface{}, s *goquery.Selecti
if len(field.ElementLocations) != 1 {
return fmt.Errorf("a field of type 'url' must exactly have one location")
}
url := getURLString(&field.ElementLocations[0], s, baseURL)
url, err := getURLString(&field.ElementLocations[0], s, baseURL)
if err != nil {
return err
}
if url == "" {
url = baseURL
}
Expand Down Expand Up @@ -681,31 +687,22 @@ func getRawDateComponents(f *Field, s *goquery.Selection) (map[string]string, er
return rawComponents, nil
}

func getURLString(e *ElementLocation, s *goquery.Selection, baseURL string) string {
func getURLString(e *ElementLocation, s *goquery.Selection, baseURL string) (string, error) {
var urlVal, urlRes string
u, _ := url.Parse(baseURL)
if e.Attr == "" {
// set attr to the default if not set
e.Attr = "href"
}
if e.Selector == "" {
urlVal = s.AttrOr(e.Attr, "")
} else {
fieldSelection := s.Find(e.Selector)
if len(fieldSelection.Nodes) > e.NodeIndex {
fieldNode := fieldSelection.Get(e.NodeIndex)
for _, a := range fieldNode.Attr {
if a.Key == e.Attr {
urlVal = a.Val
break
}
}
}

urlVal, err := getTextString(e, s)
if err != nil {
return "", err
}

urlVal = strings.TrimSpace(urlVal)
if urlVal == "" {
return ""
return "", nil
} else if strings.HasPrefix(urlVal, "http") {
urlRes = urlVal
} else if strings.HasPrefix(urlVal, "?") || strings.HasPrefix(urlVal, ".?") {
Expand All @@ -725,20 +722,20 @@ func getURLString(e *ElementLocation, s *goquery.Selection, baseURL string) stri
}

urlRes = strings.TrimSpace(urlRes)
return urlRes
return urlRes, nil
}

func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
func getTextString(e *ElementLocation, s *goquery.Selection) (string, error) {
var fieldStrings []string
var fieldSelection *goquery.Selection
if t.Selector == "" {
if e.Selector == "" {
fieldSelection = s
} else {
fieldSelection = s.Find(t.Selector)
fieldSelection = s.Find(e.Selector)
}
if len(fieldSelection.Nodes) > t.NodeIndex {
if t.Attr == "" {
if t.EntireSubtree {
if len(fieldSelection.Nodes) > e.NodeIndex {
if e.Attr == "" {
if e.EntireSubtree {
// copied from https://github.com/PuerkitoBio/goquery/blob/v1.8.0/property.go#L62
var buf bytes.Buffer
var f func(*html.Node)
Expand All @@ -753,36 +750,36 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
}
}
}
if t.AllNodes {
if e.AllNodes {
for _, node := range fieldSelection.Nodes {
f(node)
fieldStrings = append(fieldStrings, buf.String())
buf.Reset()
}
} else {
f(fieldSelection.Get(t.NodeIndex))
f(fieldSelection.Get(e.NodeIndex))
fieldStrings = append(fieldStrings, buf.String())
}
} else {

var fieldNodes []*html.Node
if t.AllNodes {
if e.AllNodes {
for _, node := range fieldSelection.Nodes {
fieldNode := node.FirstChild
if fieldNode != nil {
fieldNodes = append(fieldNodes, fieldNode)
}
}
} else {
fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild
fieldNode := fieldSelection.Get(e.NodeIndex).FirstChild
if fieldNode != nil {
fieldNodes = append(fieldNodes, fieldNode)
}
}
for _, fieldNode := range fieldNodes {
currentChildIndex := 0
for fieldNode != nil {
if currentChildIndex == t.ChildIndex {
if currentChildIndex == e.ChildIndex {
if fieldNode.Type == html.TextNode {
fieldStrings = append(fieldStrings, fieldNode.Data)
break
Expand All @@ -797,12 +794,12 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
// WRONG
// It could be the case that there are multiple nodes that match the selector
// and we don't want the attr of the first node...
fieldStrings = append(fieldStrings, fieldSelection.AttrOr(t.Attr, ""))
fieldStrings = append(fieldStrings, fieldSelection.AttrOr(e.Attr, ""))
}
}
// do json lookup if we have a json_selector
for i, f := range fieldStrings {
fieldString, err := extractJsonField(t.JsonSelector, f)
fieldString, err := extractJsonField(e.JsonSelector, f)
if err != nil {
return "", err
}
Expand All @@ -814,17 +811,17 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
}
// regex extract
for i, f := range fieldStrings {
fieldString, err := extractStringRegex(&t.RegexExtract, f)
fieldString, err := extractStringRegex(&e.RegexExtract, f)
if err != nil {
return "", err
}
fieldStrings[i] = fieldString
}
// shortening
for i, f := range fieldStrings {
fieldStrings[i] = utils.ShortenString(f, t.MaxLength)
fieldStrings[i] = utils.ShortenString(f, e.MaxLength)
}
return strings.Join(fieldStrings, t.Separator), nil
return strings.Join(fieldStrings, e.Separator), nil
}

func extractStringRegex(rc *RegexConfig, s string) (string, error) {
Expand Down

0 comments on commit 06b6220

Please sign in to comment.