Commit 300176b

Merge pull request #15 from kaushikmitr/main

Simulation code for llm inference gateway

2 parents 844dd00 + 5d48994

9 files changed: 3,908 additions, 0 deletions

simulations/llm_ig_simulation/src/__init__.py

Whitespace-only changes.
@@ -0,0 +1,213 @@
import argparse
from collections import Counter
import csv
from datetime import datetime
import numpy as np
import simpy
from llmactor import LLMActor
from loadbalancer import LoadBalancer


def main():
    parser = argparse.ArgumentParser(description="Simulate LLM load balancing with configurable parameters.")
    parser.add_argument("--rates_lo", nargs='+', type=int, default=[35, 30, 25, 20, 15, 10, 5, 1], help="List of arrival rates for low-priority requests.")
    parser.add_argument("--rates_hi", nargs='+', type=int, default=[35, 30, 25, 20, 15, 10, 5, 1], help="List of arrival rates for high-priority requests.")
    parser.add_argument("--no_of_messages", type=int, default=2500, help="Number of messages to simulate.")
    parser.add_argument("--mean_request_size_1", type=int, default=202, help="Mean request size for set 1.")
    parser.add_argument("--std_request_size_1", type=int, default=20, help="Standard deviation of request size for set 1.")
    parser.add_argument("--mean_output_size_1", type=int, default=179, help="Mean output size for set 1.")
    parser.add_argument("--std_output_size_1", type=int, default=17, help="Standard deviation of output size for set 1.")
    parser.add_argument("--mean_request_size_2", type=int, default=202, help="Mean request size for set 2.")
    parser.add_argument("--std_request_size_2", type=int, default=20, help="Standard deviation of request size for set 2.")
    parser.add_argument("--mean_output_size_2", type=int, default=179, help="Mean output size for set 2.")
    parser.add_argument("--std_output_size_2", type=int, default=17, help="Standard deviation of output size for set 2.")
    parser.add_argument("--queueing_perc", type=float, default=np.inf, help="Queueing percentage.")
    parser.add_argument('--target-latency-lo', nargs='+', type=float, help='List of target latencies for low-priority requests.')
    parser.add_argument('--target-latency-hi', nargs='+', type=float, help='List of target latencies for high-priority requests.')
    parser.add_argument('--prefix-latency-lo', nargs='+', type=str, help='List of latency-target prefix labels for low-priority requests.')
    parser.add_argument('--prefix-latency-hi', nargs='+', type=str, help='List of latency-target prefix labels for high-priority requests.')
    parser.add_argument('--number-of-servers', type=int, default=1, help='Number of servers.')

    args = parser.parse_args()

    # Use provided arguments or defaults
    rates_lo = args.rates_lo
    rates_hi = args.rates_hi
    no_of_messages = args.no_of_messages
    # Run each rate long enough to inject every message, plus a 100 s drain buffer
    # (e.g. 2500 messages at 35 req/s -> ~71.4 s + 100 s).
    SIM_DURATIONS = [no_of_messages / r + 100 for r in rates_lo]
    mean_request_size_1 = args.mean_request_size_1
    std_request_size_1 = args.std_request_size_1
    mean_output_size_1 = args.mean_output_size_1
    std_output_size_1 = args.std_output_size_1
    mean_request_size_2 = args.mean_request_size_2
    std_request_size_2 = args.std_request_size_2
    mean_output_size_2 = args.mean_output_size_2
    std_output_size_2 = args.std_output_size_2
    queueing_perc = args.queueing_perc
    lora_requested_lo = ""
    lora_requested_hi = ""
    target_latency_list_lo = args.target_latency_lo if args.target_latency_lo else [0.025]
    target_latency_list_hi = args.target_latency_hi if args.target_latency_hi else [0.5]
    prefix_latency_list_lo = args.prefix_latency_lo if args.prefix_latency_lo else ['lo']
    prefix_latency_list_hi = args.prefix_latency_hi if args.prefix_latency_hi else ['hi']
    number_of_servers = args.number_of_servers

    # Define a structure to store results for all routing types
    results = {
        'leastPseudo': {'latency': [], 'latency_lo': [], 'latency_hi': [],
                        'throughput_prefill': [], 'throughput_decode': [],
                        'throughput_prefill_lo': [], 'throughput_decode_lo': [],
                        'throughput_prefill_hi': [], 'throughput_decode_hi': [],
                        'ttft': [], 'ttft_lo': [], 'ttft_hi': [],
                        'tpot': [], 'tpot_lo': [], 'tpot_hi': [],
                        'target_pods_lo': [], 'target_pods_hi': [],
                        'recompute_cnt': [], 'recompute_cnt_hi': [], 'recompute_cnt_lo': [],
                        'pct_below_latency_target_lo': [], 'pct_below_latency_target_hi': [],
                        'queue_time_lo': [], 'queue_time_hi': [],
                        'tol_lat_time_lo': [], 'tol_lat_time_hi': [],
                        'avg_prefill_queue_size': [],
                        'avg_pending_tokens_perc': [],
                        'avg_actual_tokens_perc': []},

        'smart': {'latency': [], 'latency_lo': [], 'latency_hi': [],
                  'estimated_latency': [], 'estimated_latency_lo': [], 'estimated_latency_hi': [],
                  'throughput_prefill': [], 'throughput_decode': [],
                  'throughput_prefill_lo': [], 'throughput_decode_lo': [],
                  'throughput_prefill_hi': [], 'throughput_decode_hi': [],
                  'ttft': [], 'ttft_lo': [], 'ttft_hi': [],
                  'tpot': [], 'tpot_lo': [], 'tpot_hi': [],
                  'target_pods_lo': [], 'target_pods_hi': [],
                  'recompute_cnt': [], 'recompute_cnt_hi': [], 'recompute_cnt_lo': [],
                  'pct_below_latency_target_lo': [], 'pct_below_latency_target_hi': [],
                  'queue_time_lo': [], 'queue_time_hi': [],
                  'tol_lat_time_lo': [], 'tol_lat_time_hi': [],
                  'avg_prefill_queue_size': [],
                  'avg_pending_tokens_perc': [],
                  'avg_actual_tokens_perc': []},

        'leastlatency': {'latency': [], 'latency_lo': [], 'latency_hi': [],
                         'throughput_prefill': [], 'throughput_decode': [],
                         'throughput_prefill_lo': [], 'throughput_decode_lo': [],
                         'throughput_prefill_hi': [], 'throughput_decode_hi': [],
                         'ttft': [], 'ttft_lo': [], 'ttft_hi': [],
                         'tpot': [], 'tpot_lo': [], 'tpot_hi': [],
                         'target_pods_lo': [], 'target_pods_hi': [],
                         'recompute_cnt': [], 'recompute_cnt_hi': [], 'recompute_cnt_lo': [],
                         'pct_below_latency_target_lo': [], 'pct_below_latency_target_hi': [],
                         'queue_time_lo': [], 'queue_time_hi': [],
                         'tol_lat_time_lo': [], 'tol_lat_time_hi': [],
                         'avg_prefill_queue_size': [],
                         'avg_pending_tokens_perc': [],
                         'avg_actual_tokens_perc': []},

        'least': {'latency': [], 'latency_lo': [], 'latency_hi': [],
                  'throughput_prefill': [], 'throughput_decode': [],
                  'throughput_prefill_lo': [], 'throughput_decode_lo': [],
                  'throughput_prefill_hi': [], 'throughput_decode_hi': [],
                  'ttft': [], 'ttft_lo': [], 'ttft_hi': [],
                  'tpot': [], 'tpot_lo': [], 'tpot_hi': [],
                  'target_pods_lo': [], 'target_pods_hi': [],
                  'recompute_cnt': [], 'recompute_cnt_hi': [], 'recompute_cnt_lo': [],
                  'pct_below_latency_target_lo': [], 'pct_below_latency_target_hi': [],
                  'queue_time_lo': [], 'queue_time_hi': [],
                  'tol_lat_time_lo': [], 'tol_lat_time_hi': [],
                  'avg_prefill_queue_size': [],
                  'avg_pending_tokens_perc': [],
                  'avg_actual_tokens_perc': []},

        'random': {'latency': [], 'latency_lo': [], 'latency_hi': [],
                   'throughput_prefill': [], 'throughput_decode': [],
                   'throughput_prefill_lo': [], 'throughput_decode_lo': [],
                   'throughput_prefill_hi': [], 'throughput_decode_hi': [],
                   'ttft': [], 'ttft_lo': [], 'ttft_hi': [],
                   'tpot': [], 'tpot_lo': [], 'tpot_hi': [],
                   'target_pods_lo': [], 'target_pods_hi': [],
                   'recompute_cnt': [], 'recompute_cnt_hi': [], 'recompute_cnt_lo': [],
                   'pct_below_latency_target_lo': [], 'pct_below_latency_target_hi': [],
                   'queue_time_lo': [], 'queue_time_hi': [],
                   'tol_lat_time_lo': [], 'tol_lat_time_hi': [],
                   'avg_prefill_queue_size': [],
                   'avg_pending_tokens_perc': [],
                   'avg_actual_tokens_perc': []},
    }

    all_routing_types = ["random"]
    prompt_output_tuple = None

    # Iterate over routing types
    for routing_type in all_routing_types:
        print(f'Routing Type: {routing_type}')

        for i, _ in enumerate(rates_lo):
            req_dict = {}
            req_dict_prefill = {}
            SIM_DURATION = SIM_DURATIONS[i]
            print(f'Simulating with rates lo={rates_lo[i]}, hi={rates_hi[i]}, routing type: {routing_type}')

            # SimPy environment and LLM actors setup
            env = simpy.Environment()
            list_of_llmactors = [LLMActor(env, 1, id) for id in range(number_of_servers)]
            lb = LoadBalancer(env, number_of_servers=number_of_servers, list_of_llmactors=list_of_llmactors,
                              req_dict_prefill=req_dict_prefill, req_dict=req_dict,
                              messages_remaining_cnt=no_of_messages)
            lb.queueing_perc = queueing_perc

            estimated_output_size = mean_output_size_1
            lb.process(rates_lo[i], lora_requested_lo, target_latency_list_lo, prefix_latency_list_lo,
                       routing_type, prompt_output_tuple, mean_request_size_1, std_request_size_1,
                       mean_output_size_1, std_output_size_1, estimated_output_size)
            env.run(until=SIM_DURATION)

            # Completed requests, ordered by arrival time
            completed_req = list(filter(lambda x: x.output_size_remaining == 0, req_dict.values()))
            completed_req_sorted = sorted(completed_req, key=lambda x: x.arrival_time)
            # Optionally exclude an initial warm-up fraction of requests;
            # the fraction is currently 0, so no requests are dropped.
            exclude_count = int(0 * len(completed_req_sorted))
            filtered_req = completed_req_sorted[exclude_count:]

            # Calculate TTFT, TPOT, normalized latency, and throughput
            ttft_cur = np.mean([x.end_prefill_time - x.arrival_time for x in req_dict.values()])
            tpot_cur = np.mean([(x.end_decode_time - x.start_prefill_time) / (x.output_size - x.output_size_remaining) for x in req_dict.values()])
            latency_cur = np.mean([(x.end_decode_time - x.arrival_time) / (x.output_size - x.output_size_remaining) for x in filtered_req])
            # Estimated latency is only tracked in the 'smart' results slot; unused for 'random'.
            estimated_latency_cur = np.mean([x.estimated_latency for x in filtered_req])
            recompute_cur = np.sum([x.recompute_count for x in filtered_req]) / len(filtered_req)
            tt = SIM_DURATION
            throughput_prefill_cur = np.sum([x.input_size for x in filtered_req]) / tt
            throughput_decode_cur = np.sum([max(0, x.output_size - x.output_size_remaining - 1) for x in filtered_req]) / tt

            pending_tokens_at_arrival_perc = [x.pending_tokens_at_arrival_perc for x in completed_req]
            actual_tokens_at_arrival_perc = [x.actual_tokens_at_arrival_perc for x in completed_req]
            prefill_queue_size = [x.queue_size_before_prefill for x in completed_req]

            # Store results for the current routing type
            results[routing_type]['latency'].append(latency_cur)
            results[routing_type]['throughput_prefill'].append(throughput_prefill_cur)
            results[routing_type]['throughput_decode'].append(throughput_decode_cur)
            results[routing_type]['ttft'].append(ttft_cur)
            results[routing_type]['tpot'].append(tpot_cur)
            results[routing_type]['recompute_cnt'].append(recompute_cur)
            results[routing_type]['avg_prefill_queue_size'].append(np.mean(prefill_queue_size))
            results[routing_type]['avg_pending_tokens_perc'].append(np.mean(pending_tokens_at_arrival_perc))
            results[routing_type]['avg_actual_tokens_perc'].append(np.mean(actual_tokens_at_arrival_perc))

    # Create a timestamped output file name
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_file = f"results_{timestamp}.csv"

    # Write results to CSV
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['RoutingType', 'RateIndex', 'Latency', 'avg_prefill_queue_size', 'avg_pending_tokens_perc', 'avg_actual_tokens_perc']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Iterate over routing types and write each entry
        for routing_type in all_routing_types:
            for i in range(len(rates_lo)):
                writer.writerow({
                    'RoutingType': routing_type,
                    'RateIndex': rates_lo[i],
                    'Latency': results[routing_type]['latency'][i],
                    'avg_prefill_queue_size': results[routing_type]['avg_prefill_queue_size'][i],
                    'avg_pending_tokens_perc': results[routing_type]['avg_pending_tokens_perc'][i],
                    'avg_actual_tokens_perc': results[routing_type]['avg_actual_tokens_perc'][i],
                })

    print(f"Results have been saved to {output_file}")


if __name__ == "__main__":
    main()
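
For reference, a plausible invocation of this script from the command line (the entry-point file name main.py is an assumption, since this diff does not show the file's path; the flags are the argparse options defined above):

    python main.py --rates_lo 10 5 1 --rates_hi 10 5 1 --no_of_messages 500 \
        --number-of-servers 2 --target-latency-lo 0.025 --target-latency-hi 0.5

Each run writes results_<timestamp>.csv with one row per (routing type, rate) pair; with the defaults above, only the "random" routing policy is exercised.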
@@ -0,0 +1,21 @@
PREFILL_LATENCY_CONST_2 = 0
PREFILL_LATENCY_CONST_1 = 0.00006769375513
PREFILL_LATENCY_CONST_0 = 0.01969
PREFILL_LATENCY_CONST_MIN = 0.04

DECODE_LATENCY_CONST_BATCH = 0.0001026494433
DECODE_LATENCY_CONST_1 = 0.0000005353485087
DECODE_LATENCY_CONST_0 = 0.014
TOKENIZE_LATENCY_CONST = 0

MAX_NUM_BATCH_TOKENS = 512  # in prefill

TOTAL_NUM_GPU_BLOCKS = 2810
NUMBER_OF_TOKENS_PER_BLOCK = 16
# KV-cache capacity: 2810 blocks * 16 tokens/block - 512 prefill-batch tokens = 44,448 tokens
MAX_NUM_TOKENS_ALLOWED = TOTAL_NUM_GPU_BLOCKS * NUMBER_OF_TOKENS_PER_BLOCK - MAX_NUM_BATCH_TOKENS  # in kv cache
MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE = 0.9
MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE_NON_CRITICAL = 0.8
MAX_NUM_SEQ = 256

# size of each LoRA in units of KV cache
LORA_DICT = {"tweet": 1600, "sql": 1600, "dummy-1": 0, "dummy-2": 0}
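
The coefficient names suggest polynomial cost models for the prefill and decode phases, but the consuming code (llmactor.py) is not part of the lines shown in this diff. Below is a minimal sketch of one plausible reading, assuming a quadratic-in-tokens prefill latency with a floor and a decode-step latency linear in batch size and resident KV-cache tokens; the helper names are hypothetical and the formulas are an assumption, not the repository's definition:

# Hypothetical sketch only: the real formulas live in llmactor.py (not shown here).
def prefill_latency(num_prompt_tokens: int) -> float:
    # Assumed quadratic model, floored at PREFILL_LATENCY_CONST_MIN.
    raw = (PREFILL_LATENCY_CONST_2 * num_prompt_tokens ** 2
           + PREFILL_LATENCY_CONST_1 * num_prompt_tokens
           + PREFILL_LATENCY_CONST_0)
    return max(PREFILL_LATENCY_CONST_MIN, raw)

def decode_step_latency(batch_size: int, kv_cache_tokens: int) -> float:
    # Assumed linear model in batch size and tokens resident in the KV cache.
    return (DECODE_LATENCY_CONST_0
            + DECODE_LATENCY_CONST_BATCH * batch_size
            + DECODE_LATENCY_CONST_1 * kv_cache_tokens)

Under this reading, the floor would bind for typical prompts: prefill_latency(202) evaluates to max(0.04, 0.01969 + 0.00006769 * 202) ≈ max(0.04, 0.0334) = 0.04 s.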
