import argparse
import csv
from datetime import datetime

import numpy as np
import simpy

# Local modules from this repository
from llmactor import LLMActor
from loadbalancer import LoadBalancer

def main():
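    """Run LLM load-balancing simulations over a sweep of arrival rates and write summary metrics to a CSV."""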
    parser = argparse.ArgumentParser(description="Simulate LLM load balancing with configurable parameters.")
    parser.add_argument("--rates_lo", nargs='+', type=int, default=[35, 30, 25, 20, 15, 10, 5, 1], help="List of arrival rates for low priority requests.")
    parser.add_argument("--rates_hi", nargs='+', type=int, default=[35, 30, 25, 20, 15, 10, 5, 1], help="List of arrival rates for high priority requests.")
    parser.add_argument("--no_of_messages", type=int, default=2500, help="Number of messages to simulate.")
    parser.add_argument("--mean_request_size_1", type=int, default=202, help="Mean request size for set 1.")
    parser.add_argument("--std_request_size_1", type=int, default=20, help="Standard deviation of request size for set 1.")
    parser.add_argument("--mean_output_size_1", type=int, default=179, help="Mean output size for set 1.")
    parser.add_argument("--std_output_size_1", type=int, default=17, help="Standard deviation of output size for set 1.")
    parser.add_argument("--mean_request_size_2", type=int, default=202, help="Mean request size for set 2.")
    parser.add_argument("--std_request_size_2", type=int, default=20, help="Standard deviation of request size for set 2.")
    parser.add_argument("--mean_output_size_2", type=int, default=179, help="Mean output size for set 2.")
    parser.add_argument("--std_output_size_2", type=int, default=17, help="Standard deviation of output size for set 2.")
    parser.add_argument("--queueing_perc", type=float, default=np.inf, help="Queueing percentage.")
    parser.add_argument('--target-latency-lo', nargs='+', type=float, help='List of target latencies for low priority requests.')
    parser.add_argument('--target-latency-hi', nargs='+', type=float, help='List of target latencies for high priority requests.')
    parser.add_argument('--prefix-latency-lo', nargs='+', type=str, help='List of latency-target prefixes for low priority requests.')
    parser.add_argument('--prefix-latency-hi', nargs='+', type=str, help='List of latency-target prefixes for high priority requests.')
    parser.add_argument('--number-of-servers', type=int, default=1, help='Number of LLM servers to simulate.')
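
    # Example invocation (illustrative values; assumes this file is saved as main.py):
    #   python main.py --rates_lo 30 20 10 --rates_hi 30 20 10 \
    #       --no_of_messages 1000 --number-of-servers 4 \
    #       --target-latency-lo 0.025 --target-latency-hi 0.5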

    args = parser.parse_args()

    # Use provided arguments or defaults
    rates_lo = args.rates_lo
    rates_hi = args.rates_hi
    no_of_messages = args.no_of_messages
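    # Each simulation runs long enough to inject every message at the low-priority
    # rate, plus a fixed drain window of 100 time units for in-flight requests
    # (e.g. 2500 messages at a rate of 25 -> 100 + 100 = 200 time units).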
    SIM_DURATIONS = [no_of_messages / r + 100 for r in rates_lo]
    mean_request_size_1 = args.mean_request_size_1
    std_request_size_1 = args.std_request_size_1
    mean_output_size_1 = args.mean_output_size_1
    std_output_size_1 = args.std_output_size_1
    mean_request_size_2 = args.mean_request_size_2
    std_request_size_2 = args.std_request_size_2
    mean_output_size_2 = args.mean_output_size_2
    std_output_size_2 = args.std_output_size_2
    queueing_perc = args.queueing_perc
    lora_requested_lo = ""
    lora_requested_hi = ""
    target_latency_list_lo = args.target_latency_lo if args.target_latency_lo else [0.025]
    target_latency_list_hi = args.target_latency_hi if args.target_latency_hi else [0.5]
    prefix_latency_list_lo = args.prefix_latency_lo if args.prefix_latency_lo else ['lo']
    prefix_latency_list_hi = args.prefix_latency_hi if args.prefix_latency_hi else ['hi']
    number_of_servers = args.number_of_servers

    # Metrics collected per routing strategy. All strategies share the same
    # metric set; "smart" additionally records its own latency estimates.
    base_metrics = [
        'latency', 'latency_lo', 'latency_hi',
        'throughput_prefill', 'throughput_decode',
        'throughput_prefill_lo', 'throughput_decode_lo',
        'throughput_prefill_hi', 'throughput_decode_hi',
        'ttft', 'ttft_lo', 'ttft_hi',
        'tpot', 'tpot_lo', 'tpot_hi',
        'target_pods_lo', 'target_pods_hi',
        'recompute_cnt', 'recompute_cnt_hi', 'recompute_cnt_lo',
        'pct_below_latency_target_lo', 'pct_below_latency_target_hi',
        'queue_time_lo', 'queue_time_hi',
        'tol_lat_time_lo', 'tol_lat_time_hi',
        'avg_prefill_queue_size',
        'avg_pending_tokens_perc',
        'avg_actual_tokens_perc',
    ]
    smart_metrics = base_metrics + [
        'estimated_latency', 'estimated_latency_lo', 'estimated_latency_hi',
    ]
    results = {
        'leastPseudo': {k: [] for k in base_metrics},
        'smart': {k: [] for k in smart_metrics},
        'leastlatency': {k: [] for k in base_metrics},
        'least': {k: [] for k in base_metrics},
        'random': {k: [] for k in base_metrics},
    }
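
    # For reference: results[routing_type]['latency'][i] will hold the mean
    # normalized latency measured at the i-th entry of rates_lo.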

    # Only the "random" router is exercised in this run; `results` above is
    # keyed for the other strategies as well.
    all_routing_types = ["random"]
    prompt_output_tuple = None

    # Iterate over routing types
    for routing_type in all_routing_types:
        print(f'Routing Type: {routing_type}')

        for i, _ in enumerate(rates_lo):
            req_dict = {}
            req_dict_prefill = {}
            SIM_DURATION = SIM_DURATIONS[i]
            print(f'Simulating lo rate {rates_lo[i]}, hi rate {rates_hi[i]} with routing type {routing_type}')

            # SimPy environment and LLM actor setup: one actor per server pod
            env = simpy.Environment()
            list_of_llmactors = [LLMActor(env, 1, actor_id) for actor_id in range(number_of_servers)]
            lb = LoadBalancer(env, number_of_servers=number_of_servers, list_of_llmactors=list_of_llmactors,
                              req_dict_prefill=req_dict_prefill, req_dict=req_dict,
                              messages_remaining_cnt=no_of_messages)
            lb.queueing_perc = queueing_perc

            # Only the low-priority stream is injected here; the hi-rate and
            # set-2 parameters are parsed but unused in this script.
            estimated_output_size = mean_output_size_1
            lb.process(rates_lo[i], lora_requested_lo, target_latency_list_lo, prefix_latency_list_lo,
                       routing_type, prompt_output_tuple, mean_request_size_1, std_request_size_1,
                       mean_output_size_1, std_output_size_1, estimated_output_size)
            env.run(until=SIM_DURATION)

            # Completed requests, ordered by arrival time
            completed_req = list(filter(lambda x: x.output_size_remaining == 0, req_dict.values()))
            completed_req_sorted = sorted(completed_req, key=lambda x: x.arrival_time)
            # Optionally drop a warm-up fraction of the earliest completed
            # requests; the fraction is currently 0, so nothing is excluded.
            exclude_count = int(0 * len(completed_req_sorted))
            filtered_req = completed_req_sorted[exclude_count:]

            # Calculate TTFT, TPOT, normalized latency, and throughput.
            # TTFT is arrival-to-end-of-prefill; TPOT is decode time per generated
            # token (requests with zero generated tokens are skipped to avoid
            # dividing by zero); latency is end-to-end time per generated token.
            ttft_cur = np.mean([x.end_prefill_time - x.arrival_time for x in req_dict.values()])
            tpot_cur = np.mean([(x.end_decode_time - x.start_prefill_time) / (x.output_size - x.output_size_remaining)
                                for x in req_dict.values() if x.output_size > x.output_size_remaining])
            latency_cur = np.mean([(x.end_decode_time - x.arrival_time) / (x.output_size - x.output_size_remaining)
                                   for x in filtered_req])
            # estimated_latency is presumably only set by the "smart" router,
            # so guard it to avoid an AttributeError under other strategies.
            estimated_latency_cur = (np.mean([x.estimated_latency for x in filtered_req])
                                     if routing_type == 'smart' else np.nan)
            recompute_cur = np.sum([x.recompute_count for x in filtered_req]) / len(filtered_req)
            tt = SIM_DURATION
            throughput_prefill_cur = np.sum([x.input_size for x in filtered_req]) / tt
            throughput_decode_cur = np.sum([max(0, x.output_size - x.output_size_remaining - 1) for x in filtered_req]) / tt
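
            # Worked example: a request arriving at t=10 that starts prefill
            # immediately, ends prefill at t=10.4, and finishes decoding 100
            # tokens at t=12 has TTFT = 0.4 and TPOT = (12 - 10) / 100 = 0.02 per token.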

            pending_tokens_at_arrival_perc = [x.pending_tokens_at_arrival_perc for x in completed_req]
            actual_tokens_at_arrival_perc = [x.actual_tokens_at_arrival_perc for x in completed_req]
            prefill_queue_size = [x.queue_size_before_prefill for x in completed_req]

            # Store results for the current routing type
            results[routing_type]['latency'].append(latency_cur)
            results[routing_type]['throughput_prefill'].append(throughput_prefill_cur)
            results[routing_type]['throughput_decode'].append(throughput_decode_cur)
            results[routing_type]['ttft'].append(ttft_cur)
            results[routing_type]['tpot'].append(tpot_cur)
            results[routing_type]['recompute_cnt'].append(recompute_cur)
            results[routing_type]['avg_prefill_queue_size'].append(np.mean(prefill_queue_size))
            results[routing_type]['avg_pending_tokens_perc'].append(np.mean(pending_tokens_at_arrival_perc))
            results[routing_type]['avg_actual_tokens_perc'].append(np.mean(actual_tokens_at_arrival_perc))
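
            # Note: only these aggregate metrics are populated; the *_lo/*_hi
            # per-priority lists in `results` stay empty in this script.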

    # Create a timestamped output file name
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_file = f"results_{timestamp}.csv"

    # Write results to CSV (only this subset of the collected metrics is exported)
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['RoutingType', 'Rate', 'Latency', 'avg_prefill_queue_size', 'avg_pending_tokens_perc', 'avg_actual_tokens_perc']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Iterate over routing types and write one row per simulated rate
        for routing_type in all_routing_types:
            for i in range(len(rates_lo)):
                writer.writerow({
                    'RoutingType': routing_type,
                    'Rate': rates_lo[i],
                    'Latency': results[routing_type]['latency'][i],
                    'avg_prefill_queue_size': results[routing_type]['avg_prefill_queue_size'][i],
                    'avg_pending_tokens_perc': results[routing_type]['avg_pending_tokens_perc'][i],
                    'avg_actual_tokens_perc': results[routing_type]['avg_actual_tokens_perc'][i],
                })

    print(f"Results have been saved to {output_file}")
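
    # The CSV can be explored with any tabular tool; a hypothetical follow-up:
    #   import pandas as pd
    #   df = pd.read_csv(output_file)
    #   print(df.groupby('RoutingType')['Latency'].describe())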

if __name__ == "__main__":
    main()