Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: align data layer with python side #1144

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
"devDependencies": {
"ajv": "^8.17.1",
"bunchee": "5.3.2",
"change-case": "^5.4.4",
"natural": "^8.0.1"
},
"dependencies": {
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/schema/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// Re-exports that make up the schema entry point.
export * from "./node";
// Doc-store JSON helpers implemented in ./python.
export { DATA_KEY, TYPE_KEY, fromDocStore, type DocJson } from "./python";
export { FileReader, TransformComponent, type BaseReader } from "./type";
// The path uses an ASCII hyphen; a typographic dash here cannot resolve.
export { EngineResponse } from "./type/engine-response";
export * from "./zod";
32 changes: 11 additions & 21 deletions packages/core/src/schema/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ export enum NodeRelationship {
}

export enum ObjectType {
TEXT = "TEXT",
IMAGE = "IMAGE",
INDEX = "INDEX",
DOCUMENT = "DOCUMENT",
IMAGE_DOCUMENT = "IMAGE_DOCUMENT",
TEXT = "1",
IMAGE = "2",
INDEX = "3",
DOCUMENT = "4",
IMAGE_DOCUMENT = "5", // Python side doesn't have this enum
}

export enum MetadataMode {
Expand Down Expand Up @@ -76,7 +76,6 @@ export abstract class BaseNode<T extends Metadata = Metadata> {
excludedEmbedMetadataKeys,
excludedLlmMetadataKeys,
relationships,
hash,
embedding,
} = init || {};
this.id_ = id_ ?? randomUUID();
Expand Down Expand Up @@ -177,13 +176,12 @@ export abstract class BaseNode<T extends Metadata = Metadata> {
return {
...this,
type: this.type,
// hash is an accessor property, so it's not included in the rest operator
hash: this.hash,
// no `hash` here to align with Python side
};
}

clone(): BaseNode {
return jsonToNode(this.toMutableJSON()) as BaseNode;
return jsonToNode(this.toMutableJSON(), this.type);
}

/**
Expand Down Expand Up @@ -224,27 +222,19 @@ export class TextNode<T extends Metadata = Metadata> extends BaseNode<T> {
init;
this.text = text ?? "";
this.textTemplate = textTemplate ?? "";
if (startCharIdx) {
if (startCharIdx !== undefined) {
this.startCharIdx = startCharIdx;
}
if (endCharIdx) {
if (endCharIdx !== undefined) {
this.endCharIdx = endCharIdx;
}
this.metadataSeparator = metadataSeparator ?? "\n";
}

/**
* Generate a hash of the text node.
* The ID is not part of the hash as it can change independent of content.
* @returns
*/
generateHash() {
const hashFunction = createSHA256();
hashFunction.update(`type=${this.type}`);
hashFunction.update(
`startCharIdx=${this.startCharIdx} endCharIdx=${this.endCharIdx}`,
);
hashFunction.update(this.getContent(MetadataMode.ALL));
const docIdentity = this.text + JSON.stringify(this.metadata);
hashFunction.update(docIdentity);
return hashFunction.digest();
}

Expand Down
72 changes: 72 additions & 0 deletions packages/core/src/schema/python.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/**
* Python adapter for the schema.
*/
import { jsonToNode, ObjectType } from "./node";

export const TYPE_KEY = "__type__";
export const DATA_KEY = "__data__";

/**
 * Build a new object whose top-level keys are the camelCase form of the keys
 * in `json`. Values are carried over by reference; nested objects are not
 * visited. `change-case` is loaded with a dynamic import on every call.
 *
 * @param json - Object whose own enumerable keys should be renamed.
 * @returns A fresh object with renamed keys, in the original entry order.
 */
async function camelCaseJson(json: Record<string, any>) {
  const changeCase = await import("change-case");
  const renamed: Record<string, any> = {};
  for (const [rawKey, rawValue] of Object.entries(json)) {
    // "_" is passed as a suffix character to `camelCase`
    // (presumably so keys like `id_` keep the trailing underscore — see change-case docs).
    const jsKey = changeCase.camelCase(rawKey, { suffixCharacters: "_" });
    renamed[jsKey] = rawValue;
  }
  return renamed;
}

/**
 * Lookup from the numeric-string type tags "1".."4" to `ObjectType` members.
 * `fromDocStore` consults this table first when resolving `__type__`.
 *
 * NOTE(review): there is no entry for "5", although `ObjectType.IMAGE_DOCUMENT`
 * is declared as "5" in ./node — a record tagged "5" would be rejected by
 * `fromDocStore` with "Invalid type". Confirm this is intended.
 */
const PYTHON_TO_JS_TYPE_MAP = {
"1": ObjectType.TEXT,
"2": ObjectType.IMAGE,
"3": ObjectType.INDEX,
"4": ObjectType.DOCUMENT,
};

/**
 * Lookup from name-style type tags ("TEXT", "IMAGE", ...) to `ObjectType`
 * members. `fromDocStore` falls back to this table when the tag is not found
 * in `PYTHON_TO_JS_TYPE_MAP`.
 *
 * NOTE(review): `IMAGE_DOCUMENT` resolves to `ObjectType.DOCUMENT`, not
 * `ObjectType.IMAGE_DOCUMENT` — confirm this downgrade is intentional.
 */
const LEGACY_JS_MAP = {
TEXT: ObjectType.TEXT,
IMAGE: ObjectType.IMAGE,
INDEX: ObjectType.INDEX,
DOCUMENT: ObjectType.DOCUMENT,
IMAGE_DOCUMENT: ObjectType.DOCUMENT,
};

/**
 * Shape of one serialized doc-store record.
 * `__type__` carries the node type tag; `__data__` carries the node payload
 * as a JSON string (`fromDocStore` passes it to `JSON.parse`).
 */
export type DocJson = {
[TYPE_KEY]: string;
[DATA_KEY]: string;
};

/**
 * Convert a parsed node payload to camelCase keys.
 *
 * The top-level keys are renamed first. If the result has a truthy
 * `relationships` entry, each of its values is converted as well: an array
 * has every element converted, a non-null object is converted directly, and
 * any other value is left untouched. Relationship keys themselves are kept.
 *
 * @param data - Parsed payload with the original key naming.
 * @returns The converted payload object.
 */
async function fromImpl(data: Record<string, unknown>) {
  const convertedJson = await camelCaseJson(data);
  if (convertedJson.relationships) {
    for (const [key, value] of Object.entries(convertedJson.relationships)) {
      // Arrays must be tested before the generic object check:
      // `typeof [] === "object"`, so the reverse order would never reach
      // this branch and would rename the array's indices instead of the
      // keys of its elements.
      if (Array.isArray(value)) {
        convertedJson.relationships[key] = await Promise.all(
          value.map((v) => camelCaseJson(v)),
        );
      } else if (typeof value === "object" && value !== null) {
        convertedJson.relationships[key] = await camelCaseJson(value);
      }
    }
  }
  return convertedJson;
}

/**
 * Rebuild a node from a serialized doc-store record.
 *
 * The `__type__` tag is resolved through `PYTHON_TO_JS_TYPE_MAP` first and
 * `LEGACY_JS_MAP` second. The `__data__` string is parsed as JSON, its keys
 * are converted to camelCase, and the result is handed to `jsonToNode`
 * together with the resolved type.
 *
 * @param param0 - Record holding the type tag and the JSON-encoded payload.
 * @returns The value produced by `jsonToNode` for the converted payload.
 * @throws Error("Invalid type") when the tag is in neither lookup table.
 * @throws SyntaxError when `__data__` is not valid JSON (from `JSON.parse`).
 */
export async function fromDocStore({
  [TYPE_KEY]: type,
  [DATA_KEY]: data,
}: DocJson) {
  // Own-property check: the `in` operator would also accept inherited keys
  // such as "constructor" or "toString" and let a bogus type through.
  const hasOwnKey = (table: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(table, key);

  let objectType: ObjectType;
  if (hasOwnKey(PYTHON_TO_JS_TYPE_MAP, type)) {
    objectType =
      PYTHON_TO_JS_TYPE_MAP[type as keyof typeof PYTHON_TO_JS_TYPE_MAP];
  } else if (hasOwnKey(LEGACY_JS_MAP, type)) {
    objectType = LEGACY_JS_MAP[type as keyof typeof LEGACY_JS_MAP];
  } else {
    throw new Error("Invalid type");
  }

  const convertedJson = await fromImpl(JSON.parse(data));
  return jsonToNode(convertedJson, objectType);
}
9 changes: 6 additions & 3 deletions packages/core/tests/decorator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ describe("chunkSizeCheck", () => {
env.ENABLE_CHUNK_SIZE_CHECK = "true";

let message = "";
const consoleMock = vi
.spyOn(console, "warn")
.mockImplementation((msg) => (message += msg + "\n"));
vi.spyOn(console, "warn").mockImplementation(
(msg) => (message += msg + "\n"),
);

Settings.chunkSize = 0;

const node = new TextNode();
expect(message).toEqual("");
node.setContent("a".repeat(1024));
expect(message).toBe("");
node.getContent();
expect(message).toContain("is larger than chunk size");
});
});
77 changes: 72 additions & 5 deletions packages/core/tests/schema-node.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,74 @@
import { Document, TextNode } from "@llamaindex/core/schema";
import {
Document,
ObjectType,
TextNode,
fromDocStore,

Check failure on line 5 in packages/core/tests/schema-node.test.ts

View workflow job for this annotation

GitHub Actions / typecheck

'"@llamaindex/core/schema"' has no exported member named 'fromPythonDocStore'. Did you mean 'fromDocStore'?
} from "@llamaindex/core/schema";
import { beforeEach, describe, expect, test } from "vitest";

describe("Python", () => {
test("from python doc store", async () => {
const node = await fromPythonDocStore({
__data__: JSON.stringify({
id_: "e86be4a7-2ad0-4c3c-937b-3140f562e7a7",
embedding: null,
metadata: {},
excluded_embed_metadata_keys: [],
excluded_llm_metadata_keys: [],
relationships: {
"1": {
node_id: "e1fe8fd0-f470-40cd-bc2e-be3a220cef94",
node_type: "4",
metadata: {},
hash: "191a8fdcf068d3ac831da23cde07a92efe1432243c7f628d1009aa2ecdf6cb03",
class_name: "RelatedNodeInfo",
},
},
text: "This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test.",
mimetype: "text/plain",
start_char_idx: 0,
end_char_idx: 1599,
text_template: "{metadata_str}\n\n{content}",
metadata_template: "{key}: {value}",
metadata_seperator: "\n",
class_name: "TextNode",
}),
__type__: "1",
});
expect(node.startCharIdx).toBe(0);
expect(node.endCharIdx).toBe(1599);
expect(node).toMatchInlineSnapshot(`
{
"embedding": null,
"endCharIdx": 1599,
"excludedEmbedMetadataKeys": [],
"excludedLlmMetadataKeys": [],
"id_": "e86be4a7-2ad0-4c3c-937b-3140f562e7a7",
"metadata": {},
"metadataSeparator": "
",
"relationships": {
"1": {
"className": "RelatedNodeInfo",
"hash": "191a8fdcf068d3ac831da23cde07a92efe1432243c7f628d1009aa2ecdf6cb03",
"metadata": {},
"nodeId": "e1fe8fd0-f470-40cd-bc2e-be3a220cef94",
"nodeType": "4",
},
},
"startCharIdx": 0,
"text": "This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test.",
"textTemplate": "{metadata_str}

{content}",
"type": "1",
}
`);
expect(node.id_).toBe("e86be4a7-2ad0-4c3c-937b-3140f562e7a7");
expect(node.type).toBe(ObjectType.TEXT);
});
});

describe("Document", () => {
let document: Document;

Expand All @@ -10,7 +78,7 @@

test("should generate a hash", () => {
expect(document.hash).toMatchInlineSnapshot(
`"1mkNkQC30mZlBBG48DNuG2WSKcTQ32DImC+4JUoVijg="`,
`"oznYDHYUGHArYnhRy9lj63IvEt/rNg1EH5EjwtPU/Pc="`,
);
});

Expand All @@ -30,7 +98,7 @@

test("should generate a hash", () => {
expect(node.hash).toMatchInlineSnapshot(
`"nTSKdUTYqR52MPv/brvb4RTGeqedTEqG9QN8KSAj2Do="`,
`"oznYDHYUGHArYnhRy9lj63IvEt/rNg1EH5EjwtPU/Pc="`,
);
});

Expand All @@ -52,7 +120,6 @@
"embedding": undefined,
"excludedEmbedMetadataKeys": [],
"excludedLlmMetadataKeys": [],
"hash": "Z6SWgFPlalaeblMGQGw0KS3qKgmZdEWXKfzEp/K+QN0=",
"id_": Any<String>,
"metadata": {
"something": 1,
Expand All @@ -63,7 +130,7 @@
"relationships": {},
"text": "Hello World",
"textTemplate": "",
"type": "TEXT",
"type": "1",
}
`,
);
Expand Down
14 changes: 10 additions & 4 deletions packages/llamaindex/src/ingestion/IngestionCache.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import type { BaseNode, TransformComponent } from "@llamaindex/core/schema";
import { MetadataMode } from "@llamaindex/core/schema";
import {
type BaseNode,
fromDocStore,
MetadataMode,
type TransformComponent,
} from "@llamaindex/core/schema";
import { createSHA256 } from "@llamaindex/env";
import { docToJson, jsonToDoc } from "../storage/docStore/utils.js";
import { docToJson } from "../storage/docStore/utils.js";
import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js";
import type { BaseKVStore } from "../storage/kvStore/types.js";

Expand Down Expand Up @@ -63,6 +67,8 @@ export class IngestionCache {
if (!json || !json[this.nodesKey] || !Array.isArray(json[this.nodesKey])) {
return undefined;
}
return json[this.nodesKey].map((doc: any) => jsonToDoc(doc));
return Promise.all(
json[this.nodesKey].map((doc: any) => fromDocStore(doc)),
);
}
}
13 changes: 8 additions & 5 deletions packages/llamaindex/src/storage/docStore/KVDocumentStore.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import type { BaseNode } from "@llamaindex/core/schema";
import { ObjectType } from "@llamaindex/core/schema";
import {
type BaseNode,
fromDocStore,
ObjectType,
} from "@llamaindex/core/schema";
import _ from "lodash";
import { DEFAULT_NAMESPACE } from "../constants.js";
import type { BaseKVStore } from "../kvStore/types.js";
import type { RefDocInfo } from "./types.js";
import { BaseDocumentStore } from "./types.js";
import { docToJson, isValidDocJson, jsonToDoc } from "./utils.js";
import { docToJson, isValidDocJson } from "./utils.js";

type DocMetaData = { docHash: string; refDocId?: string };

Expand All @@ -29,7 +32,7 @@ export class KVDocumentStore extends BaseDocumentStore {
for (const key in jsonDict) {
const value = jsonDict[key];
if (isValidDocJson(value)) {
docs[key] = jsonToDoc(value);
docs[key] = await fromDocStore(value);
} else {
console.warn(`Invalid JSON for docId ${key}`);
}
Expand Down Expand Up @@ -94,7 +97,7 @@ export class KVDocumentStore extends BaseDocumentStore {
if (!isValidDocJson(json)) {
throw new Error(`Invalid JSON for docId ${docId}`);
}
return jsonToDoc(json);
return fromDocStore(json);
}

async getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined> {
Expand Down
30 changes: 1 addition & 29 deletions packages/llamaindex/src/storage/docStore/utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { BaseNode } from "@llamaindex/core/schema";
import { Document, ObjectType, TextNode } from "@llamaindex/core/schema";
import { ObjectType } from "@llamaindex/core/schema";

const TYPE_KEY = "__type__";
const DATA_KEY = "__data__";
Expand All @@ -24,31 +24,3 @@ export function docToJson(doc: BaseNode): DocJson {
[TYPE_KEY]: doc.type,
};
}

export function jsonToDoc(docDict: DocJson): BaseNode {
const docType = docDict[TYPE_KEY];
const dataDict = JSON.parse(docDict[DATA_KEY]);
let doc: BaseNode;

if (docType === ObjectType.DOCUMENT) {
doc = new Document({
text: dataDict.text,
id_: dataDict.id_,
embedding: dataDict.embedding,
hash: dataDict.hash,
metadata: dataDict.metadata,
});
} else if (docType === ObjectType.TEXT) {
doc = new TextNode({
text: dataDict.text,
id_: dataDict.id_,
hash: dataDict.hash,
metadata: dataDict.metadata,
relationships: dataDict.relationships,
});
} else {
throw new Error(`Unknown doc type: ${docType}`);
}

return doc;
}
Loading
Loading