Skip to content

Commit

Permalink
Implement download resume
Browse files Browse the repository at this point in the history
  • Loading branch information
alexferrari88 committed Jan 23, 2024
1 parent 2e50d4d commit 5a0baaf
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 4 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ Use "sbstck-dl [command] --help" for more information about a command.

You can provide the url of a single post or the main url of the Substack you want to download.

By providing the main URL of a Substack, the downloader will download all the posts of the archive.

When downloading the full archive, if the process is interrupted, the next run will automatically resume by downloading only the posts that are not yet present in the output folder.

```bash
Usage:
sbstck-dl download [flags]
Expand Down Expand Up @@ -104,7 +108,6 @@ sbstck-dl download --url https://example.substack.com --cookie_name substack.sid
## TODO
- [ ] Implementing resuming downloads
- [ ] Improve retry logic
- [ ] Implement loading from config file
- [ ] Add support for downloading media
Expand All @@ -113,3 +116,4 @@ sbstck-dl download --url https://example.substack.com --cookie_name substack.sid
- [x] Add documentation
- [x] Add support for private newsletters
- [x] Implement filtering by date
- [x] Implement resuming downloads
49 changes: 47 additions & 2 deletions cmd/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"log"
"net/url"
"path/filepath"
"strings"
"time"

Expand Down Expand Up @@ -62,17 +63,36 @@ var (
var downloadedPostsCount int
dateFilterfunc := makeDateFilterFunc(beforeDate, afterDate)
urls, err := extractor.GetAllPostsURLs(ctx, downloadUrl, dateFilterfunc)
urlsCount := len(urls)
if err != nil {
log.Fatalln(err)
}
if urlsCount == 0 {
if verbose {
fmt.Println("No posts found, exiting...")
}
return
}
if verbose {
fmt.Printf("Found %d posts\n", len(urls))
fmt.Printf("Found %d posts\n", urlsCount)
}
if dryRun {
fmt.Printf("Found %d posts\n", len(urls))
fmt.Printf("Found %d posts\n", urlsCount)
fmt.Println("Dry run, exiting...")
return
}
urls, err = filterExistingPosts(urls, outputFolder, format)
if err != nil {
if verbose {
fmt.Println("Error filtering existing posts:", err)
}
}
if len(urls) == 0 {
if verbose {
fmt.Println("No new posts found, exiting...")
}
return
}
bar := progressbar.NewOptions(len(urls),
progressbar.OptionSetWidth(25),
progressbar.OptionSetDescription("downloading"),
Expand Down Expand Up @@ -154,3 +174,28 @@ func parseURL(toTest string) (*url.URL, error) {
// makePath builds the output file path for a post as
// "<outputFolder>/<postDate>_<slug>.<format>".
// The "*_<slug>.<format>" shape of this name is what filterExistingPosts
// globs for when resuming, so the two must stay in sync.
func makePath(post lib.Post, outputFolder string, format string) string {
	fileName := fmt.Sprintf("%s_%s.%s", convertDateTime(post.PostDate), post.Slug, format)
	// filepath.Join keeps the separator OS-correct instead of hard-coding "/".
	return filepath.Join(outputFolder, fileName)
}

// extractSlug extracts the slug from a Substack post URL.
// e.g. https://example.substack.com/p/this-is-the-post-title -> this-is-the-post-title
//
// Query strings, fragments, and a trailing slash are stripped first, so
// URLs like ".../p/my-post/?utm_source=x" still yield "my-post". Without
// this, the resume check in filterExistingPosts would never match the
// file written for the post and it would be downloaded again.
func extractSlug(rawURL string) string {
	// Drop any query string or fragment before splitting on "/".
	if i := strings.IndexAny(rawURL, "?#"); i != -1 {
		rawURL = rawURL[:i]
	}
	// A trailing slash would otherwise make the last path segment empty.
	rawURL = strings.TrimSuffix(rawURL, "/")
	parts := strings.Split(rawURL, "/")
	return parts[len(parts)-1]
}

// filterExistingPosts filters out posts that already exist in the output
// folder. For each URL it globs for "*_<slug>.<format>" — the naming
// scheme produced by makePath — and keeps only URLs with no match, which
// is what lets an interrupted archive download resume where it left off.
//
// If a glob fails (e.g. filepath.ErrBadPattern from an unusual slug), the
// original list is returned unchanged along with the error, so the caller
// can fall back to downloading everything rather than silently skip posts.
func filterExistingPosts(urls []string, outputFolder string, format string) ([]string, error) {
	remaining := make([]string, 0, len(urls))
	// Named postURL rather than url to avoid shadowing the net/url package.
	for _, postURL := range urls {
		slug := extractSlug(postURL)
		// filepath.Join keeps the separator OS-correct; filepath.Glob
		// splits patterns on the OS separator, so a hard-coded "/" would
		// fail to match on Windows.
		pattern := filepath.Join(outputFolder, fmt.Sprintf("*_%s.%s", slug, format))
		matches, err := filepath.Glob(pattern)
		if err != nil {
			return urls, err
		}
		if len(matches) == 0 {
			remaining = append(remaining, postURL)
		}
	}
	return remaining, nil
}
2 changes: 1 addition & 1 deletion cmd/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ var versionCmd = &cobra.Command{
Short: "Print the version number of sbstck-dl",
Long: `Display the current version of the app.`,
Run: func(cmd *cobra.Command, args []string) {
fmt.Println("sbstck-dl v0.3.1")
fmt.Println("sbstck-dl v0.3.2")
},
}

Expand Down

0 comments on commit 5a0baaf

Please sign in to comment.