
Dummy inference engine #325 #331

Open
wants to merge 13 commits into main
32 changes: 32 additions & 0 deletions exo/inference/DummyInferenceEngine.py
@@ -0,0 +1,32 @@
import random
import asyncio
import numpy as np

class DummyInferenceEngine:
Contributor:

This needs to implement the InferenceEngine interface

  def __init__(self, output_type="static", output_value=None, output_shape=(1,), latency_mean=0.1, latency_stddev=0.1):
    self.output_type = output_type
    self.output_value = output_value
    self.output_shape = output_shape
    self.latency_mean = latency_mean
    self.latency_stddev = latency_stddev

    # Validation for static output type
    if self.output_type == "static" and self.output_value is None:
      raise ValueError("output_value must be provided when output_type is 'static'.")

  async def run_inference(self):
    # Simulate latency
    latency = max(0, random.normalvariate(self.latency_mean, self.latency_stddev))  # Non-negative latency
    await asyncio.sleep(latency)

    # Generate output based on the specified output type
    if self.output_type == "static":
      return self.output_value  # Return the static output
    elif self.output_type == "random":
      self.output_value = np.random.randn(*self.output_shape).tolist()  # Generate random output and store it
      return self.output_value

  async def get_latency(self):
    # Simulate and return the latency
    return max(0, random.normalvariate(self.latency_mean, self.latency_stddev))
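
Note on the review comment above: as written, the class stands alone rather than subclassing the project's InferenceEngine base class. The sketch below is one rough way the dummy engine could conform; the infer_prompt/infer_tensor names, parameters, and return tuple are assumptions modeled on the other engines, so the abstract methods actually defined in exo/inference/inference_engine.py remain the source of truth.

# Sketch only: the method signatures below are assumed, not taken from this diff.
import asyncio
import random
import numpy as np

from exo.inference.inference_engine import InferenceEngine  # assumed base-class location
from exo.inference.shard import Shard  # assumed import path for Shard

class DummyInferenceEngine(InferenceEngine):
  def __init__(self, output_shape=(1,), latency_mean=0.1, latency_stddev=0.1):
    self.output_shape = output_shape
    self.latency_mean = latency_mean
    self.latency_stddev = latency_stddev

  async def _simulate_latency(self):
    # Non-negative simulated latency, as in run_inference above.
    await asyncio.sleep(max(0, random.normalvariate(self.latency_mean, self.latency_stddev)))

  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str=None, inference_state=None):
    await self._simulate_latency()
    # Assumed return shape: (output tensor, new inference state, is_finished flag).
    return np.random.randn(*self.output_shape), "", True

  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state=None):
    await self._simulate_latency()
    return np.random.randn(*self.output_shape), "", True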

3 changes: 3 additions & 0 deletions exo/inference/inference_engine.py
@@ -27,5 +27,8 @@ def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDownloader'):
    tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))

    return TinygradDynamicShardInferenceEngine(shard_downloader)
  elif inference_engine_name == "dummy":
    from exo.inference.DummyInferenceEngine import DummyInferenceEngine
    return DummyInferenceEngine(output_type="static", output_value=[1, 2, 3], latency_mean=0.1, latency_stddev=0.1)
  else:
    raise ValueError(f"Inference engine {inference_engine_name} not supported")
60 changes: 60 additions & 0 deletions exo/inference/test_dummy_inference_engine.py
@@ -0,0 +1,60 @@
import asyncio
import pytest
from exo.inference.DummyInferenceEngine import DummyInferenceEngine


@pytest.mark.asyncio
async def test_dummy_engine():
  dummy_engine = DummyInferenceEngine(output_type="random", output_shape=(2, 2), latency_mean=0.5, latency_stddev=0.1)

  # Simulate inference
  output = await dummy_engine.run_inference()
  latency = await dummy_engine.get_latency()

  assert isinstance(output, list), "Output should be a list."
  assert isinstance(latency, float), "Latency should be a float."


@pytest.mark.asyncio
async def test_dummy_inference_engine_static():
  # Test with static output
  dummy_engine = DummyInferenceEngine(output_type="static", output_value=[1, 2, 3], latency_mean=0.2, latency_stddev=0.1)
  await dummy_engine.run_inference()  # Simulate inference, check for errors
  assert dummy_engine.output_value == [1, 2, 3], "The static output should match the provided value."


@pytest.mark.asyncio
async def test_dummy_inference_engine_random():
  # Test with random output
  dummy_engine = DummyInferenceEngine(output_type="random", output_shape=(128, 128), latency_mean=0.1, latency_stddev=0.1)
  await dummy_engine.run_inference()  # Simulate inference, check for errors
  output = dummy_engine.output_value

  # Check that the output is a list and has the correct shape
  assert isinstance(output, list), "Output should be a list."
  assert len(output) == 128, "Output should have the specified outer shape."

  # Check each sub-list for the correct length
  for sublist in output:
    assert isinstance(sublist, list), "Each output item should be a list."
    assert len(sublist) == 128, "Each output sub-list should have the specified inner shape."

  # Optionally check the type of each element
  for sublist in output:
    for element in sublist:
      assert isinstance(element, (float, int)), "Each element should be a float or int."


@pytest.mark.asyncio
async def test_dummy_inference_engine_latency():
  for i in range(10):
    # Test that latency is within expected range
    dummy_engine = DummyInferenceEngine(output_type="static", output_value=[1], latency_mean=0.1, latency_stddev=0.0)
    start_time = asyncio.get_event_loop().time()
    await dummy_engine.run_inference()
    elapsed_time = asyncio.get_event_loop().time() - start_time
    # Allow a small tolerance (e.g., 0.1 seconds) for system-level deviations
    tolerance = 0.1
    assert abs(elapsed_time - 0.1) <= tolerance, f"Expected latency to be around 0.1s, but got {elapsed_time}s."
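
These tests depend on pytest-asyncio (configured in pyproject.toml below) and can be run on their own with, for example, pytest exo/inference/test_dummy_inference_engine.py.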
2 changes: 1 addition & 1 deletion exo/main.py
@@ -41,7 +41,7 @@
parser.add_argument("--chatgpt-api-port", type=int, default=8000, help="ChatGPT API port")
parser.add_argument("--chatgpt-api-response-timeout", type=int, default=90, help="ChatGPT API response timeout in seconds")
parser.add_argument("--max-generate-tokens", type=int, default=10000, help="Max tokens to generate in each request")
parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use")
parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use e.g. 'mlx', 'tinygrad', 'dummy')")
Contributor:

This doesn't actually resolve the inference engine to dummy; you need to change the code for that too. Please think through your code changes, as right now this doesn't fit together at all. I'd like you to run this end-to-end with the DummyInferenceEngine before you submit it.

parser.add_argument("--disable-tui", action=argparse.BooleanOptionalAction, help="Disable TUI")
parser.add_argument("--run-model", type=str, help="Specify a model to run directly")
parser.add_argument("--prompt", type=str, help="Prompt for the model when using --run-model", default="Who are you?")
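
As the review comment notes, adding 'dummy' to the help text does not by itself route the flag to the engine factory. One hedged way to wire it through, assuming main.py constructs its engine via get_inference_engine (the helper name and fallback default below are illustrative, not taken from the file):

# Illustrative sketch only: the helper name and the fallback default are assumptions.
from exo.inference.inference_engine import get_inference_engine

def build_inference_engine(args, shard_downloader):
  inference_engine_name = args.inference_engine or "tinygrad"  # assumed fallback when the flag is omitted
  return get_inference_engine(inference_engine_name, shard_downloader)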
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -4,4 +4,7 @@ max-line-length = 200

[tool.autopep8]
max_line_length = 200
indent_size = 2
indent_size = 2

[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "function"
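
The asyncio_default_fixture_loop_scope = "function" setting is a pytest-asyncio option; with recent pytest-asyncio releases it pins async fixtures to a per-test event loop and silences the deprecation warning emitted when the option is left unset.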