Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parquet-compression for lgbm #15

Draft
wants to merge 42 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
f10b51e
add: first try at model parsing
YYYasin19 Feb 20, 2023
bd952cd
try pyarrow
YYYasin19 Feb 21, 2023
0069ca0
start with pyarrow encoding
YYYasin19 Feb 22, 2023
8e6177e
somewhat optimize i guess
YYYasin19 Feb 23, 2023
cc0cffb
encode & decode pyarrow table to parquet
YYYasin19 Mar 4, 2023
2fb8b84
cherry-pick result
YYYasin19 Mar 5, 2023
3594a41
save
YYYasin19 Mar 5, 2023
92178e1
update name
YYYasin19 Mar 5, 2023
439af86
remove explicit schema selection (temp?)
YYYasin19 Mar 5, 2023
78449aa
rename compression_utils to compression
YYYasin19 Mar 5, 2023
21071de
refactor: utils component
YYYasin19 Mar 5, 2023
43b9f0c
store leaf_values separately
YYYasin19 Mar 5, 2023
91be951
add: uncompress tree
YYYasin19 Mar 5, 2023
5b44aba
commit
YYYasin19 Mar 6, 2023
e3b9217
fix: deps
YYYasin19 Mar 6, 2023
37ed31f
all models pls
YYYasin19 Mar 6, 2023
cc8228b
suppress warnings
YYYasin19 Mar 6, 2023
660450b
cleanup
YYYasin19 Mar 6, 2023
7209db2
update benchmark :(
YYYasin19 Mar 6, 2023
54e2889
add: benchmark file for optimizing performance of handle generation
YYYasin19 Mar 6, 2023
fc09de9
push comments
YYYasin19 Mar 7, 2023
0372263
add linear features parsing and adapt tests
YYYasin19 Mar 17, 2023
5dbf70e
add linear features parsing and adapt tests
YYYasin19 Mar 17, 2023
002de1d
Merge branch 'main' into model-parsing
pavelzw Mar 19, 2023
62e20c7
Merge branch 'main' into model-parsing
pavelzw Mar 19, 2023
b28176d
Merge branch 'main' into model-parsing
pavelzw Mar 19, 2023
2eb48fa
Merge branch 'main' into model-parsing
pavelzw Mar 20, 2023
45c61f2
it works (tm)
YYYasin19 Mar 24, 2023
e76045e
review changes
YYYasin19 Mar 24, 2023
3fd5a45
review changes part 2
YYYasin19 Mar 24, 2023
c78c807
apparently needed bc. CI is slow af
YYYasin19 Mar 24, 2023
c94c29c
merge
YYYasin19 Mar 24, 2023
9052d40
re-rename
YYYasin19 Mar 24, 2023
9b9f146
still too slow
YYYasin19 Mar 24, 2023
7cfbd3c
Fix typo
pavelzw Mar 25, 2023
2aabb6a
add pandas dependency
YYYasin19 Mar 25, 2023
6149ba4
add length validation
YYYasin19 Mar 25, 2023
b8fc383
try higher compression level for lz4 (16)
YYYasin19 Mar 25, 2023
fcfa7b3
fix: int arg
YYYasin19 Mar 25, 2023
db30b19
switch to zstd, seems better here?
YYYasin19 Mar 26, 2023
a7266c9
remove linear tree
YYYasin19 Mar 26, 2023
49834ac
Merge branch 'main' into model-parsing
pavelzw Mar 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies:
- pre-commit
- pandas
- scikit-learn
- pyarrow
YYYasin19 marked this conversation as resolved.
Show resolved Hide resolved
- pytest>=7.0
- setuptools_scm
- tbump
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@ packages = find:
scikit-learn =
scikit-learn
lightgbm =
pandas
lightgbm
pyarrow
297 changes: 208 additions & 89 deletions slim_trees/lgbm_booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,27 @@
import pickle
import re
import sys
from typing import Any, BinaryIO, List, Tuple
from typing import Any, BinaryIO, List, Optional, Tuple

import numpy as np
import pandas as pd

from slim_trees import __version__ as slim_trees_version
from slim_trees.compression_utils import (
compress_half_int_float_array,
decompress_half_int_float_array,
safe_cast,
)
from slim_trees.utils import check_version
from slim_trees.utils import check_version, df_to_pq_bytes, pq_bytes_to_df

# Regexes that carve the raw LightGBM booster model string into its parts:
# the header (everything before the first "Tree="), one group per tree block,
# and the trailer starting at "end of trees".
FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"
TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"

# Narrow dtypes used when storing per-tree attributes; values are rendered
# back to text when the model string is reconstructed.
# NOTE(review): int16 assumes feature/child indices stay below 32768 — confirm.
SPLIT_FEATURE_DTYPE = np.int16
THRESHOLD_DTYPE = np.float64
DECISION_TYPE_DTYPE = np.int8
LEFT_CHILD_DTYPE = np.int16
RIGHT_CHILD_DTYPE = LEFT_CHILD_DTYPE
LEAF_VALUE_DTYPE = np.float64

try:
from lightgbm.basic import Booster
Expand Down Expand Up @@ -68,124 +78,233 @@ def _decompress_booster_state(compressed_state: dict):
return state


def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]:
def _extract_feature(feature_line: str) -> Tuple[str, List[str]]:
feat_name, values_str = feature_line.split("=")
return feat_name, values_str.split(" ")


def _validate_feature_lengths(feats_map: dict):
# features on tree-level
assert len(feats_map["num_leaves"]) == 1
assert len(feats_map["num_cat"]) == 1
assert len(feats_map["is_linear"]) == 1
assert len(feats_map["shrinkage"]) == 1

# features on node-level
num_leaves = int(feats_map["num_leaves"][0])
assert len(feats_map["split_feature"]) == num_leaves - 1
assert len(feats_map["threshold"]) == num_leaves - 1
assert len(feats_map["decision_type"]) == num_leaves - 1
assert len(feats_map["left_child"]) == num_leaves - 1
assert len(feats_map["right_child"]) == num_leaves - 1

# features on leaf-level
num_leaves = int(feats_map["num_leaves"][0])
assert len(feats_map["leaf_value"]) == num_leaves


def parse(str_list, dtype):
    """Convert a list of value strings into a numpy array of ``dtype``.

    Integer dtypes are parsed as int64 first and then narrowed through
    ``safe_cast``; floating-point dtypes are parsed directly.
    """
    if not np.can_cast(dtype, np.int64):
        # non-integer target: must still fit a float64 and can be parsed directly
        assert np.can_cast(dtype, np.float64)
        return np.array(str_list, dtype=dtype)
    as_int64 = np.array(str_list, dtype=np.int64)
    return safe_cast(as_int64, dtype)


def _compress_booster_handle(
model_string: str,
) -> Tuple[str, bytes, bytes, bytes, Optional[bytes], str]:
if not model_string.startswith("tree\nversion=v3"):
raise ValueError("Only v3 is supported for the booster string format.")
FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"
TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"

def _extract_feature(feature_line):
feat_name, values_str = feature_line.split("=")
return feat_name, values_str.split(" ")

front_str_match = re.search(FRONT_STRING_REGEX, model_string)
if front_str_match is None:
raise ValueError("Could not find front string.")
front_str = front_str_match.group()
# delete tree_sizes line since this messes up the tree parsing by LightGBM if not set correctly
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why this line is deleted? 🤨

# todo calculate correct tree_sizes
front_str = re.sub(r"tree_sizes=(?:\d+ )*\d+\n", "", front_str)
front_str = re.sub(r"tree_sizes=(?:\d+ )*\d+\n", "", front_str_match.group())

back_str_match = re.search(BACK_STRING_REGEX, model_string)
if back_str_match is None:
raise ValueError("Could not find back string.")
back_str = back_str_match.group()

tree_matches = re.findall(TREE_GROUP_REGEX, model_string)
node_features: List[dict] = []
leaf_values: List[dict] = []
trees: List[dict] = []
for i, tree_match in enumerate(tree_matches):
linear_values: List[dict] = []
for _i, tree_match in enumerate(tree_matches):
tree_name, features_list = tree_match
_, tree_idx = tree_name.replace("\n", "").split("=")
assert int(tree_idx) == i

# extract features -- filter out empty ones
features = [f for f in features_list.split("\n") if "=" in f]
feats_map = dict(_extract_feature(fl) for fl in features)

def parse(str_list, dtype):
if np.can_cast(dtype, np.int64):
int64_array = np.array(str_list, dtype=np.int64)
return safe_cast(int64_array, dtype)
assert np.can_cast(dtype, np.float64)
return np.array(str_list, dtype=dtype)

split_feature_dtype = np.int16
threshold_dtype = np.float64
decision_type_dtype = np.int8
left_child_dtype = np.int16
right_child_dtype = left_child_dtype
leaf_value_dtype = np.float64
assert len(feats_map["num_leaves"]) == 1
assert len(feats_map["num_cat"]) == 1
assert len(feats_map["is_linear"]) == 1
assert len(feats_map["shrinkage"]) == 1
# validate that we have the correct lengths
_validate_feature_lengths(feats_map)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding a comment explaining that we have different kinds of sizes which results in four different tables might be helpful.

# length = 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe assert that the length is actually of that size?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do check most of these above, don't we?
This is just a comment for future us so we remember the length of these features

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm yes, we do check most of the length = 1 things...

I think I also meant the length = num_inner_nodes = num_leaves - 1 and length = num_leaves parts... We could add length as a parameter to parse() and asserts the length there for the rest?

YYYasin19 marked this conversation as resolved.
Show resolved Hide resolved
trees.append(
{
"tree_idx": int(tree_idx),
"num_leaves": int(feats_map["num_leaves"][0]),
"num_cat": int(feats_map["num_cat"][0]),
"split_feature": parse(feats_map["split_feature"], split_feature_dtype),
"threshold": compress_half_int_float_array(
parse(feats_map["threshold"], threshold_dtype)
),
"decision_type": parse(feats_map["decision_type"], decision_type_dtype),
"left_child": parse(feats_map["left_child"], left_child_dtype),
"right_child": parse(feats_map["right_child"], right_child_dtype),
"leaf_value": parse(feats_map["leaf_value"], leaf_value_dtype),
"is_linear": int(feats_map["is_linear"][0]),
"shrinkage": float(feats_map["shrinkage"][0]),
}
)
return front_str, trees, back_str

# length = num_inner_nodes = num_leaves - 1
node_features.append(
{
"tree_idx": int(tree_idx),
# all the upcoming attributes have length num_leaves - 1
"split_feature": parse(feats_map["split_feature"], SPLIT_FEATURE_DTYPE),
"threshold": parse(feats_map["threshold"], THRESHOLD_DTYPE),
"decision_type": parse(feats_map["decision_type"], DECISION_TYPE_DTYPE),
"left_child": parse(feats_map["left_child"], LEFT_CHILD_DTYPE),
"right_child": parse(feats_map["right_child"], RIGHT_CHILD_DTYPE),
}
)

# length = num_leaves
leaf_values.append(
{
"tree_idx": int(tree_idx),
"leaf_value": parse(feats_map["leaf_value"], LEAF_VALUE_DTYPE),
}
)

# length = sum_l=0^{num_leaves} {num_features(l)}
YYYasin19 marked this conversation as resolved.
Show resolved Hide resolved
# attributes: leaf_features, leaf_coeff, leaf_const, num_features\
# TODO: some of these attributes, e.g. leaf_const, might not be needed
if "leaf_features" in feats_map:
leaf_values[-1]["leaf_const"] = parse(
feats_map["leaf_const"], LEAF_VALUE_DTYPE
)
leaf_values[-1]["num_features"] = parse(feats_map["num_features"], np.int32)

linear_values.append(
{
"tree_idx": int(tree_idx),
"leaf_features": parse(
[s if s else -1 for s in feats_map["leaf_features"]],
np.int16,
),
"leaf_coeff": parse(
[s if s else None for s in feats_map["leaf_coeff"]], np.float64
),
}
)

tree_value_bytes = df_to_pq_bytes(pd.DataFrame(trees))

nodes_df = pd.DataFrame(node_features)
node_values_bytes = df_to_pq_bytes(
nodes_df.explode(
[
"split_feature",
"threshold",
"decision_type",
"left_child",
"right_child",
]
)
)

leaf_values_bytes = df_to_pq_bytes(
pd.DataFrame(leaf_values).explode(
["leaf_value"] + (["leaf_const", "num_features"] if linear_values else [])
)
)

linear_values_bytes = None
if linear_values:
linear_values_bytes = df_to_pq_bytes(
pd.DataFrame(linear_values).explode(["leaf_features", "leaf_coeff"])
)

return (
front_str,
tree_value_bytes,
node_values_bytes,
leaf_values_bytes,
linear_values_bytes,
back_str,
)


def _decompress_booster_handle(compressed_state: Tuple[str, List[dict], str]) -> str:
front_str, trees, back_str = compressed_state
def _decompress_booster_handle(
compressed_state: Tuple[str, bytes, bytes, bytes, bytes, str]
) -> str:
(
front_str,
trees_df_bytes,
nodes_df_bytes,
leaf_value_bytes,
linear_values_bytes,
back_str,
) = compressed_state
assert type(front_str) == str
assert type(trees) == list
assert type(back_str) == str

handle = front_str

for i, tree in enumerate(trees):
assert type(tree) == dict
assert tree.keys() == {
"num_leaves",
"num_cat",
"split_feature",
"threshold",
"decision_type",
"left_child",
"right_child",
"leaf_value",
"is_linear",
"shrinkage",
}

num_leaves = len(tree["leaf_value"])
num_nodes = len(tree["split_feature"])

tree_str = f"Tree={i}\n"
tree_str += f"num_leaves={tree['num_leaves']}\nnum_cat={tree['num_cat']}\nsplit_feature="
tree_str += " ".join([str(x) for x in tree["split_feature"]])
tree_str += "\nsplit_gain=" + ("0 " * num_nodes)[:-1]
threshold = decompress_half_int_float_array(tree["threshold"])
tree_str += "\nthreshold=" + " ".join([str(x) for x in threshold])
tree_str += "\ndecision_type=" + " ".join(
[str(x) for x in tree["decision_type"]]
trees_df = pq_bytes_to_df(trees_df_bytes)
nodes_df = pq_bytes_to_df(nodes_df_bytes).groupby("tree_idx").agg(lambda x: list(x))
leaf_values_df = (
pq_bytes_to_df(leaf_value_bytes).groupby("tree_idx").agg(lambda x: list(x))
)

# merge trees_df, nodes_df, and leaf_values_df on tree_idx
trees_df = trees_df.merge(nodes_df, on="tree_idx")
trees_df = trees_df.merge(leaf_values_df, on="tree_idx")
if linear_values_bytes is not None:
linear_values_df = (
pq_bytes_to_df(linear_values_bytes)
.groupby("tree_idx")
.agg(lambda x: list(x))
)
tree_str += "\nleft_child=" + " ".join([str(x) for x in tree["left_child"]])
tree_str += "\nright_child=" + " ".join([str(x) for x in tree["right_child"]])
tree_str += "\nleaf_value=" + " ".join([str(x) for x in tree["leaf_value"]])
tree_str += "\nleaf_weight=" + ("0 " * num_leaves)[:-1]
tree_str += "\nleaf_count=" + ("0 " * num_leaves)[:-1]
tree_str += "\ninternal_value=" + ("0 " * num_nodes)[:-1]
tree_str += "\ninternal_weight=" + ("0 " * num_nodes)[:-1]
tree_str += "\ninternal_count=" + ("0 " * num_nodes)[:-1]
tree_str += f"\nis_linear={tree['is_linear']}"
tree_str += f"\nshrinkage={tree['shrinkage']}"
tree_str += "\n\n\n"

handle += tree_str
handle += back_str
return handle
trees_df = trees_df.merge(linear_values_df, on="tree_idx")

tree_strings = [front_str]

for i, tree in trees_df.iterrows():
num_leaves = int(tree["num_leaves"])
num_nodes = num_leaves - 1

# add the appropriate block if those values are present
if tree["is_linear"]:
linear_str = f"""
leaf_const={" ".join(str(x) for x in tree['leaf_const'])}
num_features={" ".join(str(x) for x in tree['num_features'])}
leaf_features={" ".join(["" if f == -1 else str(int(f)) for f in tree['leaf_features']])}
leaf_coeff={" ".join(["" if np.isnan(f) else str(f) for f in tree['leaf_coeff']])}"""
else:
linear_str = ""

tree_strings.append(
f"""Tree={i}
num_leaves={int(tree["num_leaves"])}
num_cat={tree['num_cat']}
split_feature={' '.join([str(x) for x in tree["split_feature"]])}
split_gain={("0" * num_nodes)[:-1]}
threshold={' '.join([str(x) for x in tree['threshold']])}
decision_type={' '.join([str(x) for x in tree["decision_type"]])}
left_child={" ".join([str(x) for x in tree["left_child"]])}
right_child={" ".join([str(x) for x in tree["right_child"]])}
leaf_value={" ".join([str(x) for x in tree["leaf_value"]])}
leaf_weight={("0 " * num_leaves)[:-1]}
leaf_count={("0 " * num_leaves)[:-1]}
internal_value={("0 " * num_nodes)[:-1]}
internal_weight={("0 " * num_nodes)[:-1]}
internal_count={("0 " * num_nodes)[:-1]}
is_linear={tree['is_linear']}{linear_str}
shrinkage={tree['shrinkage']}


"""
)

tree_strings.append(back_str)

return "".join(tree_strings)
Loading