Skip to content

Commit

Permalink
MarkdownDB: extracting links plus refactor (#472)
Browse files Browse the repository at this point in the history
Closes #473 

## Motivation
As we plan to replace Contentlayer with our MarkdownDB package, we need a solid base upon which we can build a full-fledged "content-layer" solution. As the code in its current state is pretty hard to reason about and manipulate and is rather weakly typed, the development of any new functionality on top of it would only make it harder and harder to engineer, adjust and test. This PR aims to make the development of new features easier and (because of stronger typing) less error-prone.

Note, there is still quite a lot of room for improvement - TODO comments were left in the code but will probably be resolved after further discussions on implementation design.

## Changes
- `markdowndb.ts` cleaned up and refactored. Most notably:
  - tables creation (and deletion) logic moved out to `schema.ts` and bound with relevant classes corresponding to tables (see below)
  - wrapped raw batch inserts on knex db with class methods to prevent SQL errors in the first place, instead of debugging them after they occur
  - `indexFolder` bound with MarkdownDB class as a method
  - removed duplication of db initialization -> there is only one `init` method on `MarkdownDB` now
  - replaced ambiguous `query` method with `getFiles`
    - commented out querying by folder (?) for now as I'm not sure if it's even needed, at least not in the current form
  - added `getFileByUrl` and `getFileById` methods to retrieve a single file from the db (e.g. for finding its backlinks)
  - added `getLinks` method which supports querying back- or forward links
  - removed `types.ts`; classes in `schema.ts` can also be used as types. Other types, e.g. `DatabaseQuery`, moved directly to the method signature, as they are strictly bound to it and there is no need to keep them in a separate file (there is no other use case for them)
  - cleaned up and extended tests of `markdowndb` lib and split into smaller unit tests grouped by tested functionalities
- `schema.ts` created with four classes: `File`, `Link`, `Tag` and `FileTag`
  - each class is now a representation of a table, i.e. `files`, `links`, `tags`, `file_tags` and so it describes the fields (columns) existent on each table and has methods for creating a table, deleting it and batch-inserting data to it
  - it is a single source of truth about tables and data types stored in them
- `utils` folder created with `recursiveWalkDir.ts`, `parseFile.ts` and `extractWikiLinks.ts` each with its own, separate test suite 
- stronger typing
  • Loading branch information
olayway committed Apr 4, 2023
1 parent 4d9289a commit ad65355
Show file tree
Hide file tree
Showing 30 changed files with 922 additions and 377 deletions.
6 changes: 6 additions & 0 deletions packages/markdowndb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# @flowershow/markdowndb

## 0.0.3

### Patch Changes

- 135a238: clean-up and refactoring with a more OOP approach and stronger typing.

## 0.0.2

### Patch Changes
Expand Down
2 changes: 2 additions & 0 deletions packages/markdowndb/__mocks__/content/blog/blog1.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ title: My Test Mdx Blog 1
---

# My Test Mdx Blog 1

[[Blog2]]
3 changes: 3 additions & 0 deletions packages/markdowndb/__mocks__/content/blog/blog2.mdx
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
---
title: My Test Mdx Blog 2
type: blog
tags:
- economy
---

# My Test Mdx Blog 2

[[../Blog0]]
3 changes: 3 additions & 0 deletions packages/markdowndb/__mocks__/content/blog/blog3.mdx
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
---
title: My Test Mdx Blog 2
type: blog
tags:
- politics
- economy
---

# My Test Mdx Blog 2

[[/blog/Blog1]]
2 changes: 2 additions & 0 deletions packages/markdowndb/__mocks__/content/blog0.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ type: blog
---

# My Test Mdx Blog 1

[[blog/Blog2]]
6 changes: 6 additions & 0 deletions packages/markdowndb/__mocks__/content/news/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
type: news
---

# Document Title

7 changes: 7 additions & 0 deletions packages/markdowndb/__mocks__/content/news/news1.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
type: news
tags:
- culture
---

# Document Title
8 changes: 8 additions & 0 deletions packages/markdowndb/__mocks__/content/news/news2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
type: news
tags:
- sports
---

# Document Title

18 changes: 8 additions & 10 deletions packages/markdowndb/jest.config.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
/* eslint-disable */
export default {
import type { JestConfigWithTsJest } from "ts-jest";

const jestConfig: JestConfigWithTsJest = {
displayName: "markdowndb",
preset: "../../jest.preset.js",
globals: {
"ts-jest": {
tsconfig: "<rootDir>/tsconfig.spec.json",
},
},
testEnvironment: "node",
transform: {
"^.+\\.[tj]sx?$": "ts-jest",
"^.+\\.[tj]s?$": "ts-jest",
},
moduleFileExtensions: ["ts", "tsx", "js", "jsx"],
coverageDirectory: "../../coverage/packages/markdowndb",
transformIgnorePatterns: ["<rootDir>/node_modules/(?!remark-parse)"],
moduleFileExtensions: ["ts", "js"],
};

export default jestConfig;
5 changes: 4 additions & 1 deletion packages/markdowndb/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@flowershow/markdowndb",
"version": "0.0.2",
"version": "0.0.3",
"description": "Parse markdown files and store them in an SQL database.",
"repository": {
"type": "git",
Expand All @@ -27,5 +27,8 @@
"gray-matter": "^4.0.3",
"knex": "^2.4.2",
"sqlite3": "^5.1.6"
},
"devDependencies": {
"remark-gfm": "^3.0.1"
}
}
229 changes: 157 additions & 72 deletions packages/markdowndb/src/lib/markdowndb.spec.ts
Original file line number Diff line number Diff line change
@@ -1,101 +1,186 @@
import knex from "knex";
import * as markdowndb from "./markdowndb";
import * as fs from "fs";
// import knex from "knex";
import { MarkdownDB } from "./markdowndb";
import { Table } from "./schema";
import { recursiveWalkDir } from "../utils";

/**
* @jest-environment node
*/
describe("MarkdownDB lib", () => {
it("builds a new MarkdownDB", async () => {
const pathToFixturesFolder = "packages/markdowndb/__mocks__/content";

// TODO test index files
describe("MarkdownDB", () => {
const pathToContentFixture = "packages/markdowndb/__mocks__/content";
let mddb: MarkdownDB;

beforeAll(async () => {
const dbConfig = {
client: "sqlite3",
connection: {
filename: "markdown.db",
},
};

const db = knex(dbConfig);
mddb = new MarkdownDB(dbConfig);
await mddb.init();
await mddb.indexFolder({ folderPath: pathToContentFixture });
});

// Index folder
await markdowndb.indexFolder("markdown.db", pathToFixturesFolder);
afterAll(async () => {
// TODO why we have to call this twice?
mddb.db.destroy();
mddb._destroyDb();
});

// Ensure there is a "files" table
expect(await db.schema.hasTable("files")).toBe(true);
describe("correct startup and indexing", () => {
test("adds tables to db", async () => {
expect(await mddb.db.schema.hasTable(Table.Files)).toBe(true);
expect(await mddb.db.schema.hasTable(Table.Tags)).toBe(true);
expect(await mddb.db.schema.hasTable(Table.FileTags)).toBe(true);
expect(await mddb.db.schema.hasTable(Table.Links)).toBe(true);
});

// Ensure there is a "tags" table
expect(await db.schema.hasTable("tags")).toBe(true);
test("indexes all files in folder", async () => {
const allFiles = recursiveWalkDir(pathToContentFixture);
const allIndexedFiles = await mddb.getFiles();
expect(allIndexedFiles).toHaveLength(allFiles.length);
});
});

// Ensure there is a "file_tags" table
expect(await db.schema.hasTable("file_tags")).toBe(true);
describe("querying files", () => {
test("can get all files", async () => {
const dbFiles = await mddb.getFiles();
const dbFilesPaths = dbFiles.map((f) => f.file_path);
const allFilesPaths = recursiveWalkDir(pathToContentFixture);

const myMdDb = markdowndb.Database("markdown.db");
expect(dbFiles).toHaveLength(allFilesPaths.length);
dbFilesPaths.forEach((p) => {
expect(allFilesPaths).toContain(p);
});
});

// Check if all files were indexed
const allFiles = walk(pathToFixturesFolder);
const allFilesCount = allFiles.length;
test("can query by file type", async () => {
const dbFiles = await mddb.getFiles({ filetypes: ["blog"] });
const dbFilesPaths = dbFiles.map((f) => f.file_path);

const allIndexedFiles = await myMdDb.query();
expect(allIndexedFiles.length).toBe(allFilesCount);
const expectedPaths = [
`${pathToContentFixture}/blog/blog3.mdx`,
`${pathToContentFixture}/blog/blog2.mdx`,
`${pathToContentFixture}/blog0.mdx`,
];

// Check if querying by folder is working
const blogFiles = allFiles.filter((p) =>
p.startsWith(`${pathToFixturesFolder}/blog/`)
);
const blogFilesCount = blogFiles.length;
expect(dbFilesPaths).toHaveLength(expectedPaths.length);
dbFilesPaths.forEach((p) => {
expect(expectedPaths).toContain(p);
});
});

const indexedBlogFiles = await myMdDb.query({
folder: "blog",
filetypes: ["md", "mdx"],
test("can query by tags", async () => {
const dbFiles = await mddb.getFiles({ tags: ["economy", "politics"] });
const dbFilesPaths = dbFiles.map((f) => f.file_path);

const expectedPaths = [
`${pathToContentFixture}/blog/blog3.mdx`,
`${pathToContentFixture}/blog/blog2.mdx`,
];

expect(dbFilesPaths).toHaveLength(expectedPaths.length);
dbFilesPaths.forEach((p) => {
expect(expectedPaths).toContain(p);
});
});

test("can query by extensions", async () => {
const dbFiles = await mddb.getFiles({ extensions: ["png"] });
const dbFilesPaths = dbFiles.map((f) => f.file_path);

const expectedPaths = [
`${pathToContentFixture}/assets/datopian-logo.png`,
];

expect(dbFilesPaths).toHaveLength(expectedPaths.length);
dbFilesPaths.forEach((p) => {
expect(expectedPaths).toContain(p);
});
});

expect(indexedBlogFiles.length).toBe(blogFilesCount);
test("can query by tags AND filetypes AND extensions", async () => {
const dbFiles = await mddb.getFiles({
tags: ["culture"],
filetypes: ["news"],
extensions: ["md", "mdx"],
});
const dbFilesPaths = dbFiles.map((f) => f.file_path);
const expectedPaths = [`${pathToContentFixture}/news/news1.mdx`];

expect(dbFilesPaths).toHaveLength(expectedPaths.length);
dbFilesPaths.forEach((p) => {
expect(expectedPaths).toContain(p);
});
});

// Check if querying by tags is working
const economyFiles = await myMdDb.query({ tags: ["economy"] });
const economyFilesPaths = economyFiles.map((f) => f._path);
test("can find file by url path", async () => {
const dbFile = await mddb.getFileByUrl("blog/blog2");
expect(dbFile.url_path).toBe("blog/blog2");
});

const expectedPaths = [
`${pathToFixturesFolder}/blog/blog3.mdx`,
`${pathToFixturesFolder}/blog/blog2.mdx`,
];
test("can find file by id", async () => {
const dbFile = await mddb.getFileByUrl("blog/blog2");
const dbFileById = await mddb.getFileById(dbFile._id);
expect(dbFileById.url_path).toBe("blog/blog2");
});
});

expect(economyFilesPaths).toHaveLength(expectedPaths.length);
economyFilesPaths.forEach((p) => {
expect(expectedPaths).toContain(p);
describe("getTags", () => {
// TODO the list of tags in db should be defined in some config file instead of being extracted from all the files
test("can get all tags", async () => {
const dbTags = await mddb.getTags();
const extectedTags = [
{ name: "economy" },
{ name: "politics" },
{ name: "sports" },
{ name: "culture" },
];

expect(dbTags).toHaveLength(extectedTags.length);
dbTags.forEach((t) => {
expect(extectedTags).toContainEqual(t);
});
});
});

// Check if querying by filetypes is working
const pngFiles = await myMdDb.query({ filetypes: ["png"] });
expect(
pngFiles
.map((f) => f.filetype)
// Filter out duplicates
.filter((v, i, s) => {
return s.indexOf(v) === i;
})
).toEqual(["png"]);

db.destroy();
myMdDb._destroyDb();
describe("getLinks", () => {
test("can get all forward links of a file", async () => {
const fromFile = await mddb.getFileByUrl("blog/blog2");
const toFile = await mddb.getFileByUrl("blog0");

const forwardLinks = await mddb.getLinks({
fileId: fromFile._id,
});
expect(forwardLinks.length).toBe(1);
expect(forwardLinks[0].to).toBe(toFile._id);
});

test("can get all backward links of a file", async () => {
const toFile = await mddb.getFileByUrl("blog/blog2");
const fromFile1 = await mddb.getFileByUrl("blog0");
const fromFile2 = await mddb.getFileByUrl("blog/blog1");

const backwardLinks = await mddb.getLinks({
fileId: toFile._id,
direction: "backward",
});
const backwardLinksFileIds = backwardLinks.map((l) => l.from);
expect(backwardLinksFileIds).toHaveLength(2);
expect(backwardLinksFileIds).toContain(fromFile1._id);
expect(backwardLinksFileIds).toContain(fromFile2._id);
});
});
});

const walk = (dir: fs.PathLike) => {
let files: string[] = [];
for (const item of fs.readdirSync(dir)) {
if (!(dir as string).endsWith("/")) {
dir += "/";
}

const fullPath = dir + item;
const stat = fs.statSync(fullPath);

if (stat.isDirectory()) {
files = files.concat(walk(fullPath));
} else if (stat.isFile()) {
files.push(fullPath);
}
}
return files;
};
test("can query by folder", async () => {
const allBlogFiles = recursiveWalkDir(`${pathToContentFixture}/blog`);
const indexedBlogFiles = await mddb.getFiles({
folder: "blog",
});
expect(indexedBlogFiles.length).toBe(allBlogFiles.length);
});
});
Loading

0 comments on commit ad65355

Please sign in to comment.