From 810db61962f51145a98441ca13a1a5f7bd676771 Mon Sep 17 00:00:00 2001 From: Justyn Hunter Date: Tue, 7 Nov 2023 15:30:59 -0600 Subject: [PATCH 1/2] adds methods to set the IContentParser --- WebReaper/Builders/ScraperEngineBuilder.cs | 21 ++++++++++++++------- WebReaper/Builders/SpiderBuilder.cs | 10 ++++++++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/WebReaper/Builders/ScraperEngineBuilder.cs b/WebReaper/Builders/ScraperEngineBuilder.cs index fc36db5..418644d 100644 --- a/WebReaper/Builders/ScraperEngineBuilder.cs +++ b/WebReaper/Builders/ScraperEngineBuilder.cs @@ -8,6 +8,7 @@ using WebReaper.Core.CookieStorage.Abstract; using WebReaper.Core.LinkTracker.Abstract; using WebReaper.Core.LinkTracker.Concrete; +using WebReaper.Core.Parser.Abstract; using WebReaper.Core.Scheduler.Abstract; using WebReaper.Core.Scheduler.Concrete; using WebReaper.Domain; @@ -35,9 +36,15 @@ public class ScraperEngineBuilder private IScheduler Scheduler { get; set; } = new InMemoryScheduler(); private IScraperConfigStorage? ConfigStorage { get; set; } = new InMemoryScraperConfigStorage(); - + protected IProxyProvider? ProxyProvider { get; set; } + public ScraperEngineBuilder WithContentParser(IContentParser contentParser) + { + SpiderBuilder.WithContentParser(contentParser); + return this; + } + public ScraperEngineBuilder AddSink(IScraperSink sink) { SpiderBuilder.AddSink(sink); @@ -186,7 +193,7 @@ public ScraperEngineBuilder GetWithBrowser( ConfigBuilder.GetWithBrowser(startUrls, actionBuilder?.Invoke(new PageActionBuilder())); return this; } - + public ScraperEngineBuilder GetWithBrowser(params string[] startUrls) { ConfigBuilder.GetWithBrowser(startUrls); @@ -201,7 +208,7 @@ public ScraperEngineBuilder Follow(string linkSelector) public ScraperEngineBuilder FollowWithBrowser( string linkSelector, - Func>? actionBuilder = null) { ConfigBuilder.FollowWithBrowser(linkSelector, actionBuilder?.Invoke(new PageActionBuilder())); @@ -278,7 +285,7 @@ public ScraperEngineBuilder WithMongoDbCookieStorage(string connectionString, st logger); return this; } - + public ScraperEngineBuilder WithFileCookieStorage(string fileName) { SpiderBuilder.WithFileCookieStorage(fileName); @@ -335,12 +342,12 @@ public ScraperEngineBuilder WithParallelismDegree(int parallelismDegree) public async Task BuildAsync() { SpiderBuilder.WithConfigStorage(ConfigStorage); - + var config = ConfigBuilder.Build(); var spider = SpiderBuilder.Build(); - + await ConfigStorage.CreateConfigAsync(config); return new ScraperEngine(_parallelismDegree, ConfigStorage, Scheduler, spider, Logger); } -} \ No newline at end of file +} diff --git a/WebReaper/Builders/SpiderBuilder.cs b/WebReaper/Builders/SpiderBuilder.cs index c93700f..322544d 100644 --- a/WebReaper/Builders/SpiderBuilder.cs +++ b/WebReaper/Builders/SpiderBuilder.cs @@ -51,6 +51,12 @@ public class SpiderBuilder protected event Action ScrapedData; + public SpiderBuilder WithContentParser(IContentParser contentParser) + { + ContentParser = contentParser; + return this; + } + public SpiderBuilder WithLogger(ILogger logger) { Logger = logger; @@ -166,7 +172,7 @@ public SpiderBuilder WithRedisCookieStorage(string connectionString, string redi CookieStorage = new RedisCookieStorage(connectionString, redisKey, Logger); return this; } - + public SpiderBuilder WithFileCookieStorage(string fileName) { CookieStorage = new FileCookieStorage(fileName, Logger); @@ -235,4 +241,4 @@ public ISpider Build() return spider; } -} \ No newline at end of file +} From 0f84f7255063245cc3cda3b862e6acd1f9a9a936 Mon Sep 17 00:00:00 2001 From: Justyn Hunter Date: Tue, 7 Nov 2023 16:15:29 -0600 Subject: [PATCH 2/2] undo whitespace changes --- WebReaper/Builders/ScraperEngineBuilder.cs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/WebReaper/Builders/ScraperEngineBuilder.cs b/WebReaper/Builders/ScraperEngineBuilder.cs index 418644d..e4105c9 100644 --- a/WebReaper/Builders/ScraperEngineBuilder.cs +++ b/WebReaper/Builders/ScraperEngineBuilder.cs @@ -36,7 +36,6 @@ public class ScraperEngineBuilder private IScheduler Scheduler { get; set; } = new InMemoryScheduler(); private IScraperConfigStorage? ConfigStorage { get; set; } = new InMemoryScraperConfigStorage(); - protected IProxyProvider? ProxyProvider { get; set; } public ScraperEngineBuilder WithContentParser(IContentParser contentParser) @@ -193,7 +192,6 @@ public ScraperEngineBuilder GetWithBrowser( ConfigBuilder.GetWithBrowser(startUrls, actionBuilder?.Invoke(new PageActionBuilder())); return this; } - public ScraperEngineBuilder GetWithBrowser(params string[] startUrls) { ConfigBuilder.GetWithBrowser(startUrls); @@ -285,7 +283,6 @@ public ScraperEngineBuilder WithMongoDbCookieStorage(string connectionString, st logger); return this; } - public ScraperEngineBuilder WithFileCookieStorage(string fileName) { SpiderBuilder.WithFileCookieStorage(fileName); @@ -342,10 +339,8 @@ public ScraperEngineBuilder WithParallelismDegree(int parallelismDegree) public async Task BuildAsync() { SpiderBuilder.WithConfigStorage(ConfigStorage); - var config = ConfigBuilder.Build(); var spider = SpiderBuilder.Build(); - await ConfigStorage.CreateConfigAsync(config); return new ScraperEngine(_parallelismDegree, ConfigStorage, Scheduler, spider, Logger);