Skip to content

Commit

Permalink
feat: move to interfaces
Browse files Browse the repository at this point in the history
Signed-off-by: Felipe Zipitria <[email protected]>
  • Loading branch information
fzipi committed Sep 21, 2024
1 parent d2fba4c commit ceb3e46
Show file tree
Hide file tree
Showing 13 changed files with 414 additions and 119 deletions.
15 changes: 9 additions & 6 deletions cmd/quantitative.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
package cmd

import (
"fmt"
"os"

"github.com/spf13/cobra"

"github.com/coreruleset/go-ftw/internal/quantitative"
"github.com/coreruleset/go-ftw/output"
"github.com/spf13/cobra"
"os"
)

// NewQuantitativeCmd
Expand All @@ -20,8 +23,6 @@ func NewQuantitativeCmd() *cobra.Command {
RunE: runQuantitativeE,
}

runCmd.Flags().BoolP("markdown", "m", false, "Markdown table output mode")
runCmd.Flags().IntP("sample", "s", 0, "Process every s-th line of input (s % of lines)")
runCmd.Flags().IntP("lines", "l", 0, "Number of lines of input to process before stopping")
runCmd.Flags().IntP("paranoia-level", "P", 1, "Paranoia level used to run the quantitative tests")
runCmd.Flags().IntP("corpus-line", "n", 0, "Number is the payload line from the corpus to exclusively send")
Expand Down Expand Up @@ -50,14 +51,17 @@ func runQuantitativeE(cmd *cobra.Command, _ []string) error {
directory, _ := cmd.Flags().GetString("directory")
fast, _ := cmd.Flags().GetInt("fast")
lines, _ := cmd.Flags().GetInt("lines")
markdown, _ := cmd.Flags().GetBool("markdown")
outputFilename, _ := cmd.Flags().GetString("file")
paranoiaLevel, _ := cmd.Flags().GetInt("paranoia-level")
payload, _ := cmd.Flags().GetString("payload")
number, _ := cmd.Flags().GetInt("number")
rule, _ := cmd.Flags().GetInt("rule")
wantedOutput, _ := cmd.Flags().GetString("output")

if paranoiaLevel > 1 && rule > 0 {
return fmt.Errorf("paranoia level and rule ID cannot be used together")
}

// use outputFile to write to file
var outputFile *os.File
var err error
Expand All @@ -80,7 +84,6 @@ func runQuantitativeE(cmd *cobra.Command, _ []string) error {
Directory: directory,
Fast: fast,
Lines: lines,
Markdown: markdown,
ParanoiaLevel: paranoiaLevel,
Number: number,
Payload: payload,
Expand Down
9 changes: 5 additions & 4 deletions cmd/quantitative_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@ package cmd

import (
"context"
"github.com/spf13/cobra"
"github.com/stretchr/testify/suite"
"io/fs"
"os"
"path"
"testing"

"github.com/spf13/cobra"
"github.com/stretchr/testify/suite"
)

var crsSetupFileContents = `# CRS Setup Configuration File`
var emptyRulesFile = `# Empty Rules File`
var crsSetupFileContents = `# CRS Setup Configuration filename`
var emptyRulesFile = `# Empty Rules filename`

type quantitativeCmdTestSuite struct {
suite.Suite
Expand Down
46 changes: 34 additions & 12 deletions experimental/corpus/types.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// Copyright 2024 OWASP CRS Project
// SPDX-License-Identifier: Apache-2.0

// Package corpus provides functionality for creating and managing corpora.
//
// A corpus is a collection of text documents that are used for training and testing machine learning models.
Expand All @@ -15,12 +18,27 @@
// interface is subject to change.
package corpus

// CorpusFile contains the cache directory and file name
type CorpusFile struct {
// Type is a string-backed enumeration naming a kind of corpus.
type Type string

const (
// Leipzig is the Type value for the Leipzig corpus; its string form is "leipzig".
Leipzig Type = "leipzig"
)

// File interface is used to interact with Corpus files.
// It provides methods for setting the cache directory and file path.
type File interface {
// CacheDir is the directory where files are cached
CacheDir string
CacheDir() string

// FilePath is the path to the cached file
FilePath string
FilePath() string

// WithCacheDir sets the cache directory
WithCacheDir(cacheDir string) File

// WithFilePath sets the file path
WithFilePath(filePath string) File
}

// Corpus is the interface that must be implemented to make a corpus available to clients
Expand All @@ -32,24 +50,21 @@ type Corpus interface {
WithURL(url string) Corpus

// FetchCorpusFile fetches the corpus file from the remote URL and returns a File for interacting with it.
FetchCorpusFile() CorpusFile
FetchCorpusFile() File

// GetIterator returns an iterator for the corpus
GetIterator(c CorpusFile) Iterator

// GetPayload returns the payload given a line from the Corpus Iterator
GetPayload(line string) string
GetIterator(c File) Iterator

// Size returns the size of the corpus
Size() string

// WithSize sets the size of the corpus
// Most corpora will have sizes like "100K", "1M", etc., related to the number of sentences in the corpus
WithSize(size string) Corpus

// Year returns the year of the corpus
Year() string

// WithYear sets the year of the corpus
// Most corpora will have a year like "2023", "2022", etc.
WithYear(year string) Corpus
Expand All @@ -72,8 +87,15 @@ type Corpus interface {
// Iterator is an interface for iterating over a corpus
type Iterator interface {
// Next returns the next sentence from the corpus
Next() string
Next() Payload
// HasNext returns true unless the end of the corpus has been reached
// false otherwise
HasNext() bool
}

// Payload is the value returned by Iterator.Next: one entry of the corpus,
// split into its line number and its text.
type Payload interface {
// LineNumber returns the line number associated with this payload.
LineNumber() int
// Content returns the text of this payload, without the line number.
Content() string
}
57 changes: 26 additions & 31 deletions internal/quantitative/leipzig/corpus.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
// Copyright 2024 OWASP CRS Project
// SPDX-License-Identifier: Apache-2.0

package leipzig

import (
"bufio"
"fmt"
"github.com/coreruleset/go-ftw/experimental/corpus"
"github.com/hashicorp/go-getter"
"github.com/rs/zerolog/log"
"os"
"path"
"path/filepath"
"strings"

"github.com/hashicorp/go-getter"
"github.com/rs/zerolog/log"

"github.com/coreruleset/go-ftw/experimental/corpus"
)

// LeipzigCorpus represents a corpus of text data
Expand All @@ -33,8 +37,8 @@ type LeipzigCorpus struct {
lang string
// corpusFile is the original file name that contains the corpus file
corpusFile string
// File is the file name of the corpus
Filename string
// filename is the file name of the corpus
filename string
// size is the size of the corpus
size string
// source is the source of the corpus
Expand All @@ -47,7 +51,7 @@ func (c *LeipzigCorpus) regenerateFileNames() {
c.corpusFile = fmt.Sprintf("%s_%s_%s_%s.%s",
c.lang, c.source, c.year, c.size,
defaultCorpusExt)
c.File = fmt.Sprintf("%s_%s_%s_%s-%s",
c.filename = fmt.Sprintf("%s_%s_%s_%s-%s",
c.lang, c.source, c.year, c.size,
defaultCorpusType)
}
Expand All @@ -57,7 +61,7 @@ func NewLeipzigCorpus() corpus.Corpus {
leipzig := &LeipzigCorpus{
url_: defaultCorpusSite,
corpusFile: "",
File: "",
filename: "",
lang: defaultCorpusLanguage,
source: defaultCorpusSource,
year: defaultCorpusYear,
Expand Down Expand Up @@ -115,7 +119,7 @@ func (c *LeipzigCorpus) WithSource(source string) corpus.Corpus {
}

// Lang returns the language of the corpus
func (c *LeipzigCorpus) Lang() string {
func (c *LeipzigCorpus) Language() string {
return c.lang
}

Expand All @@ -126,14 +130,15 @@ func (c *LeipzigCorpus) WithLanguage(lang string) corpus.Corpus {
}

// GetIterator returns an iterator for the corpus
func (c *LeipzigCorpus) GetIterator(cache corpus.CorpusFile) corpus.Iterator {
func (c *LeipzigCorpus) GetIterator(cache corpus.File) corpus.Iterator {
// open cache file
if cache.FilePath == "" {
cached := cache.FilePath()
if cached == "" {
log.Fatal().Msg("Cache file path is empty")
}
file, err := os.Open(cache.FilePath)
file, err := os.Open(cached)
if err != nil {
log.Fatal().Err(err).Msgf("Could not open the file %s", cache.FilePath)
log.Fatal().Err(err).Msgf("Could not open the file %s", cached)
}
scanner := bufio.NewScanner(file)
it := &LeipzigIterator{
Expand All @@ -142,16 +147,9 @@ func (c *LeipzigCorpus) GetIterator(cache corpus.CorpusFile) corpus.Iterator {
return it
}

// GetPayload returns the payload from the line
// We assume that the first word is the line number,
// and we want the rest
func (c *LeipzigCorpus) GetPayload(line string) string {
return strings.Join(strings.Split(line, "\t")[1:], " ")
}

// GetCorpusFile gets the file from the remote url.
// FetchCorpusFile gets the file from the remote url.
// We assume that the file is compressed somehow, and we want to get a file from the container.
func (c *LeipzigCorpus) GetCorpusFile() corpus.CorpusFile {
func (c *LeipzigCorpus) FetchCorpusFile() corpus.File {
home, err := os.UserHomeDir()
if err != nil {
log.Fatal().Err(err).Msg("Could not get home directory")
Expand All @@ -167,14 +165,11 @@ func (c *LeipzigCorpus) GetCorpusFile() corpus.CorpusFile {
log.Fatal().Err(err).Msg("Could not create destination directory")
}

cache := corpus.CorpusFile{
CacheDir: cacheDir,
FilePath: "",
}
cache := NewFile().WithCacheDir(cacheDir)

if info, err := os.Stat(path.Join(home, ".ftw", c.File)); err == nil {
log.Debug().Msgf("File %s already exists", info.Name())
cache.FilePath = path.Join(home, ".ftw", c.File)
if info, err := os.Stat(path.Join(home, ".ftw", cache.FilePath())); err == nil {
log.Debug().Msgf("filename %s already exists", info.Name())
cache = cache.WithFilePath(path.Join(home, ".ftw", c.filename))
return cache
}

Expand Down Expand Up @@ -202,15 +197,15 @@ func (c *LeipzigCorpus) GetCorpusFile() corpus.CorpusFile {

log.Trace().Msgf("Checking file %s", info.Name())

if info.Name() == c.File {
if info.Name() == c.filename {
newPath := filepath.Join(cacheDir, info.Name())
err = os.Rename(path, newPath)
if err != nil {
fmt.Println("Error moving:", err)
return err
}
fmt.Println("Moved", path, "to", newPath)
cache.FilePath = newPath
cache = cache.WithFilePath(newPath)
}

return nil
Expand Down
28 changes: 14 additions & 14 deletions internal/quantitative/leipzig/corpus_test.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
// Copyright 2024 OWASP CRS Project
// SPDX-License-Identifier: Apache-2.0

package leipzig

import (
"github.com/coreruleset/go-ftw/experimental/corpus"
"github.com/stretchr/testify/suite"
"testing"

"github.com/stretchr/testify/suite"

"github.com/coreruleset/go-ftw/experimental/corpus"
)

type leipzigCorpusTestSuite struct {
suite.Suite
corpus corpus.Corpus
cache corpus.CorpusFile
cache corpus.File
iter corpus.Iterator
}

Expand All @@ -20,7 +25,7 @@ func TestLeipzigCorpusTestSuite(t *testing.T) {
func (s *leipzigCorpusTestSuite) SetupTest() {
s.corpus = NewLeipzigCorpus()
s.Require().Equal("https://downloads.wortschatz-leipzig.de/corpora", s.corpus.URL())
s.Require().Equal("eng", s.corpus.Lang())
s.Require().Equal("eng", s.corpus.Language())
s.Require().Equal("100K", s.corpus.Size())
s.Require().Equal("news", s.corpus.Source())
s.Require().Equal("2023", s.corpus.Year())
Expand All @@ -33,20 +38,15 @@ func (s *leipzigCorpusTestSuite) TestWithSize() {

func (s *leipzigCorpusTestSuite) TestGetIterator() {
s.corpus.WithSize("10K")
s.cache = s.corpus.GetCorpusFile()
s.cache = s.corpus.FetchCorpusFile()
s.iter = s.corpus.GetIterator(s.cache)
}

func (s *leipzigCorpusTestSuite) TestNextSentenceFromCorpus() {
s.cache = s.corpus.GetCorpusFile()
s.iter = s.corpus.GetIterator(s.cache)
s.Require().True(s.iter.HasNext())
s.Require().Equal("1\t$156,834 for The Pathway to Excellence in Practice program through Neighborhood Place of Puna.", s.iter.Next())
}

func (s *leipzigCorpusTestSuite) TestGetPayloadFromString() {
s.cache = s.corpus.GetCorpusFile()
s.cache = s.corpus.FetchCorpusFile()
s.iter = s.corpus.GetIterator(s.cache)
s.Require().True(s.iter.HasNext())
s.Require().Equal("1\t$156,834 for The Pathway to Excellence in Practice program through Neighborhood Place of Puna.", s.iter.Next())
payload := s.iter.Next()
s.Require().Equal(1, payload.LineNumber())
s.Require().Equal("$156,834 for The Pathway to Excellence in Practice program through Neighborhood Place of Puna.", payload.Content())
}
39 changes: 39 additions & 0 deletions internal/quantitative/leipzig/file.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2024 OWASP CRS Project
// SPDX-License-Identifier: Apache-2.0

package leipzig

import "github.com/coreruleset/go-ftw/experimental/corpus"

// File implements the corpus.File interface.
// It records where a corpus is cached: the cache directory and the path of the cached file.
// Its methods use value receivers, so WithCacheDir and WithFilePath return a
// modified copy instead of changing the File they are called on.
type File struct {
// cacheDir is the directory where files are cached.
cacheDir string
// filePath is the path to the cached file.
filePath string
}

// NewFile builds a zero-valued File (empty cache directory, empty file path)
// and returns it as a corpus.File.
func NewFile() corpus.File {
	var empty File
	return empty
}

// CacheDir reports the cache directory stored in this File.
func (f File) CacheDir() string { return f.cacheDir }

// FilePath reports the path of the cached file stored in this File.
func (f File) FilePath() string { return f.filePath }

// WithCacheDir returns a copy of the File whose cache directory is cacheDir.
// The receiver is a value, so the File the method was called on is left
// untouched; callers must use the returned corpus.File.
func (f File) WithCacheDir(cacheDir string) corpus.File {
	updated := f
	updated.cacheDir = cacheDir
	return updated
}

// WithFilePath returns a copy of the File whose file path is filePath.
// The receiver is a value, so the File the method was called on is left
// untouched; callers must use the returned corpus.File.
func (f File) WithFilePath(filePath string) corpus.File {
	updated := f
	updated.filePath = filePath
	return updated
}
Loading

0 comments on commit ceb3e46

Please sign in to comment.