From 75a4d84cffca13e7c31764fe6ab24068624f68b6 Mon Sep 17 00:00:00 2001
From: Jason Lee <jason@zerodevx.com>
Date: Fri, 11 Feb 2022 17:21:47 +0800
Subject: [PATCH] Add `-m, --match <glob...>` option

---
 README.md    | 20 ++++++++++++++------
 package.json |  7 ++-----
 src/cli.js   |  1 +
 src/index.js | 24 +++++++++++++-----------
 4 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/README.md b/README.md
index ff4fc43..753001e 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ CLI to generate XML sitemaps for static sites from local filesystem
 Options:
   -b, --base <url>                       base URL (required)
   -r, --root <dir>                       root working directory (default: ".")
+  -m, --match <glob...>                  globs to match (default: ["**/*.html"])
   -i, --ignore <glob...>                 globs to ignore (default: ["404.html"])
   -c, --changefreq <glob,changefreq...>  comma-separated glob-changefreq pairs
   -p, --priority <glob,priority...>      comma-separated glob-priority pairs
@@ -59,7 +60,7 @@ Options:
 
 #### HTML parsing
 
-By default, all matched files are piped through a fast
+By default, all matched `.html` files are piped through a fast
 [HTML parser](https://github.com/fb55/htmlparser2) to detect if the `noindex`
 [meta tag](https://developers.google.com/search/docs/advanced/crawling/block-indexing#meta-tag) is
 set - typically in the form of `<meta name="robots" content="noindex" />` - in which case that file
@@ -99,13 +100,13 @@ Disabled by default; pass option `--slash` to enable.
 [always added](https://github.com/zerodevx/static-sitemap-cli/tree/v1#to-slash-or-not-to-slash) to
 root domains.
 
-#### Ignore some files
+#### Match or ignore files
 
-The `-i` flag allows multiple entries. By default, it's set to the `["404.html"]`. Change the glob
-ignore patterns to suit your use-case like so:
+The `-m` and `-i` flags allow multiple entries. By default, they are set to `["**/*.html"]` and
+`["404.html"]` respectively. Change the glob patterns to suit your use case like so:
 
 ```
-$ sscli ... -i '404.html' '**/ignore/**' 'this/other/specific/file.html'
+$ sscli ... -m '**/*.{html,jpg,png}' -i '404.html' 'ignore/**' 'this/other/specific/file.html'
 ```
 
 #### Glob-[*] pairs
@@ -143,7 +144,13 @@ $ sscli -b https://x.com -r dist -f xml -o > www/sm.xml
 #### Get subset of a directory
 
 ```
-$ sscli -b https://x.com/foo -r dist/foo -f txt -o > dist/sitemap.txt
+$ sscli -b https://x.com/foo -r dist/foo -f xml -o > dist/sitemap.xml
+```
+
+#### Generate TXT sitemap for image assets
+
+```
+$ sscli -b https://x.com -r dist -m '**/*.{jpg,jpeg,gif,png,bmp,webp,svg}' -f txt
 ```
 
 ## Programmatic Use
@@ -160,6 +167,7 @@ import {
 const options = {
   base: 'https://x.com',
   root: 'path/to/root',
+  match: ['**/*.html'],
   ignore: ['404.html'],
   changefreq: [],
   priority: [],
diff --git a/package.json b/package.json
index 63bafb1..9bdb54f 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "static-sitemap-cli",
-  "version": "2.0.1",
+  "version": "2.1.0",
   "description": "CLI to generate XML sitemaps for static sites from local filesystem",
   "author": "Jason Lee <jason@zerodevx.com>",
   "type": "module",
@@ -38,10 +38,7 @@
   ],
   "license": "ISC",
   "homepage": "https://npmjs.com/package/static-sitemap-cli",
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/zerodevx/static-sitemap-cli.git"
-  },
+  "repository": "github:zerodevx/static-sitemap-cli",
   "keywords": [
     "sscli",
     "sitemap",
diff --git a/src/cli.js b/src/cli.js
index 93cd30b..8c45542 100644
--- a/src/cli.js
+++ b/src/cli.js
@@ -16,6 +16,7 @@ program
   .description('CLI to generate XML sitemaps for static sites from local filesystem')
   .option('-b, --base <url>', 'base URL (required)')
   .option('-r, --root <dir>', 'root working directory', '.')
+  .option('-m, --match <glob...>', 'globs to match', ['**/*.html'])
   .option('-i, --ignore <glob...>', 'globs to ignore', ['404.html'])
   .option('-c, --changefreq <glob,changefreq...>', 'comma-separated glob-changefreq pairs')
   .option('-p, --priority <glob,priority...>', 'comma-separated glob-priority pairs')
diff --git a/src/index.js b/src/index.js
index b24aae2..09a99be 100644
--- a/src/index.js
+++ b/src/index.js
@@ -10,8 +10,8 @@ function log(msg) {
   console.warn('\x1b[36m%s\x1b[0m', `[sscli] ${msg}`)
 }
 
-async function getFiles({ root, ignore, verbose }) {
-  const files = await fastglob('**/*.html', { cwd: root, stats: true, ignore })
+async function getFiles({ root, match, ignore, verbose }) {
+  const files = await fastglob(match, { cwd: root, stats: true, ignore })
   if (!files.length) {
     throw new Error('NO_MATCHES')
   }
@@ -43,16 +43,18 @@ function detectNoindex(path) {
 }
 
 async function transformUrl(
-  file,
+  { path, stats: { mtime } },
   { root, base, changefreq, priority, robots, clean, slash, verbose }
 ) {
-  if (robots) {
-    if (await detectNoindex(nodepath.join(root, file.path))) {
-      if (verbose) log(`noindex: ${file.path}`)
-      return
-    }
+  if (
+    robots &&
+    nodepath.extname(path) === '.html' &&
+    (await detectNoindex(nodepath.join(root, path)))
+  ) {
+    if (verbose) log(`noindex: ${path}`)
+    return
   }
-  let url = base + file.path.split(nodepath.sep).join('/')
+  let url = base + path.split(nodepath.sep).join('/')
   if (clean) {
     if (url.slice(-11) === '/index.html') url = url.slice(0, -11)
     else if (url.slice(-5) === '.html') url = url.slice(0, -5)
@@ -61,12 +63,12 @@ async function transformUrl(
   const check = (pairs, tagname) => {
     for (let a = pairs.length - 1; a >= 0; a--) {
       const p = pairs[a].split(',')
-      if (micromatch.isMatch(file.path, p[0])) return { [tagname]: p[1] }
+      if (micromatch.isMatch(path, p[0])) return { [tagname]: p[1] }
     }
   }
   return {
     loc: url,
-    lastmod: file.stats.mtime.toISOString(),
+    lastmod: mtime.toISOString(),
     ...(changefreq && changefreq.length && check(changefreq, 'changefreq')),
     ...(priority && priority.length && check(priority, 'priority'))
   }