Extract generation_manager from tokenizer_manager #3115

Open: fzyzcjy wants to merge 55 commits into main.

Changes shown from 1 commit of 55.
Commits (55, all by fzyzcjy):

a2f2f48  empty file (Jan 25, 2025)
815dbc3  empty class (Jan 25, 2025)
3c0e52f  mv MetricManager (Jan 25, 2025)
65b3a37  fix (Jan 25, 2025)
6ce5236  mv _ReqState (Jan 25, 2025)
7ca0a47  mv GenerationConverter.init (Jan 25, 2025)
b88e450  mv tokenize_request (Jan 25, 2025)
3b8ed7b  simp branch (Jan 25, 2025)
2f47f92  tokenize_requests (Jan 25, 2025)
e21a05e  mv postprocess_response (Jan 25, 2025)
ab5d79a  simp code (Jan 25, 2025)
053c8f4  extract _compute_meta_info (Jan 25, 2025)
02c451c  mv convert_logprob_style etc (Jan 25, 2025)
ccd5e8a  make private (Jan 25, 2025)
ecf5e21  mv GenerationManager.init (Jan 25, 2025)
818f8cd  mv GenerationManager body (Jan 25, 2025)
022eb4f  fix import (Jan 25, 2025)
dc53f8f  mv modelconfig (Jan 25, 2025)
c4f1668  call generation_converter (Jan 25, 2025)
1670ce1  fix metrics (Jan 25, 2025)
905d247  fix err (Jan 25, 2025)
41bee7d  handle tokenizer_manager.generate_request (Jan 25, 2025)
2b3ca96  handle abort_request (Jan 25, 2025)
e293f1f  add field (Jan 25, 2025)
2424cf2  rm empty func (Jan 25, 2025)
422ea33  extract _RequestDumper (Jan 25, 2025)
3e6e363  call setup (Jan 25, 2025)
56dcbd1  call handle_batch_output (Jan 25, 2025)
0c08f30  more tokenizer_manager call generation_manager (Jan 25, 2025)
deec6af  use property (Jan 25, 2025)
43dd4e2  call request_dumper (Jan 25, 2025)
2d09b58  call on_request (Jan 25, 2025)
5701e20  fix minor field names (Jan 25, 2025)
cff89f0  fix more field names (Jan 25, 2025)
ba0f1b1  more (Jan 25, 2025)
4b03255  extract _RequestLogger (Jan 25, 2025)
75dc737  extract logger body (Jan 25, 2025)
4100d60  fix err (Jan 25, 2025)
ba4ad8e  fix field (Jan 25, 2025)
9080d45  fmt (Jan 25, 2025)
b1932a6  handle max_req_input_len (Jan 25, 2025)
559ecba  fmt (Jan 25, 2025)
8a10a42  Merge branch 'main' into feat/generation_manager (Jan 25, 2025)
45937e6  Merge branch 'main' into feat/generation_manager (Jan 26, 2025)
cfd3852  Merge branch 'main' into feat/generation_manager (Jan 26, 2025)
4543136  bump ci (Jan 26, 2025)
5e16f96  Merge remote-tracking branch 'origin/feat/generation_manager' into fe… (Jan 26, 2025)
b761936  Merge branch 'main' into feat/generation_manager (Jan 26, 2025)
aeed015  Revert "bump ci" (Jan 26, 2025)
c42431b  Merge remote-tracking branch 'origin/feat/generation_manager' into fe… (Jan 26, 2025)
03b5799  Merge branch 'main' into feat/generation_manager (Jan 26, 2025)
2588e23  Merge branch 'main' into feat/generation_manager (Jan 26, 2025)
7daa570  Merge branch 'main' into feat/generation_manager (Jan 27, 2025)
bbd7908  bump ci (Jan 26, 2025)
e669e45  Revert "bump ci" (Jan 27, 2025)
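
Read in order, the commits trace the refactor: scaffold an empty GenerationManager, move metrics, tokenization, post-processing, request dumping, and logging into dedicated helpers, then rewire TokenizerManager to call the new module. Below is a minimal sketch of the layout the series appears to build toward; the class names come from the commit messages, while the docstrings and composition are assumptions, not the PR's final code.

# Hypothetical end-state layout of generation_manager.py; only the class
# names are taken from the commit messages, the wiring is assumed.


class GenerationConverter:
    """Tokenizes incoming requests and post-processes engine responses."""


class _MetricManager:
    """Records per-request metrics such as time-to-first-token and latency."""


class _RequestDumper:
    """Dumps finished requests for offline inspection."""


class _RequestLogger:
    """Logs request lifecycle events."""


class GenerationManager:
    """Front-end generation lifecycle extracted from TokenizerManager."""

    def __init__(self) -> None:
        # Assumed composition; the real constructors take server args etc.
        self.converter = GenerationConverter()
        self.metrics = _MetricManager()
        self.request_dumper = _RequestDumper()
        self.request_logger = _RequestLogger()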
mv MetricManager
fzyzcjy committed Jan 25, 2025
commit 3c0e52f7214ce7ce4bb4284006df9c2ad88ac41a
65 changes: 64 additions & 1 deletion python/sglang/srt/managers/generation_manager.py
@@ -1,3 +1,11 @@
+import dataclasses
+import time
+from typing import Optional
+
+from sglang.srt.metrics.collector import TokenizerMetricsCollector
+from sglang.srt.server_args import ServerArgs
+
+
 class GenerationManager:
     pass
 
@@ -7,4 +15,59 @@ class GenerationConverter:
 
 
 class _MetricManager:
-    pass
+    def __init__(self, server_args: ServerArgs):
+        self.metrics_collector = TokenizerMetricsCollector(
+            labels={
+                "model_name": server_args.served_model_name,
+                # TODO: Add lora name/path in the future,
+            },
+        )
+
+    def handle_batch_output_metrics(
+        self,
+        recv_obj,
+        i: int,
+        state: "_MetricReqState",
+        finished: bool,
+        stream: Optional[bool],
+    ):
+        completion_tokens = (
+            recv_obj.completion_tokens[i]
+            if getattr(recv_obj, "completion_tokens", None)
+            else 0
+        )
+
+        if state.first_token_time is None:
+            state.first_token_time = time.time()
+            self.metrics_collector.observe_time_to_first_token(
+                state.first_token_time - state.created_time
+            )
+        else:
+            if completion_tokens >= 2:
+                # Compute time_per_output_token for the streaming case
+                self.metrics_collector.observe_time_per_output_token(
+                    (time.time() - state.first_token_time) / (completion_tokens - 1)
+                )
+
+        if finished:
+            self.metrics_collector.observe_one_finished_request(
+                recv_obj.prompt_tokens[i], completion_tokens
+            )
+            self.metrics_collector.observe_e2e_request_latency(
+                time.time() - state.created_time
+            )
+            # Compute time_per_output_token for the non-streaming case
+            if (
+                stream is not None
+                and not stream
+                and completion_tokens >= 1
+            ):
+                self.metrics_collector.observe_time_per_output_token(
+                    (time.time() - state.created_time) / completion_tokens
+                )
+
+
+@dataclasses.dataclass
+class _MetricReqState:
+    created_time: float
+    first_token_time: Optional[float] = None
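
For orientation, here is a minimal driver sketch for the new _MetricManager, covering one streamed request. Only _MetricManager, _MetricReqState, and handle_batch_output_metrics come from the diff above; the ServerArgs construction and the SimpleNamespace stand-ins for the batch output objects are assumptions.

import time
from types import SimpleNamespace

from sglang.srt.managers.generation_manager import _MetricManager, _MetricReqState
from sglang.srt.server_args import ServerArgs

metrics = _MetricManager(ServerArgs(model_path="dummy-model"))  # assumed minimal args
state = _MetricReqState(created_time=time.time())

# First streamed chunk: first_token_time is unset, so time-to-first-token
# is recorded.
chunk = SimpleNamespace(prompt_tokens=[12], completion_tokens=[1])
metrics.handle_batch_output_metrics(chunk, i=0, state=state, finished=False, stream=True)

# Final chunk: records streaming time-per-output-token, the finished-request
# counters, and end-to-end latency.
final = SimpleNamespace(prompt_tokens=[12], completion_tokens=[9])
metrics.handle_batch_output_metrics(final, i=0, state=state, finished=True, stream=True)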
46 changes: 2 additions & 44 deletions python/sglang/srt/managers/tokenizer_manager.py
@@ -96,10 +96,6 @@ class ReqState:
     event: asyncio.Event
     obj: Any
 
-    # For metrics
-    created_time: float
-    first_token_time: Optional[float] = None
-
     # For streaming output
     last_output_offset: int = 0
 
@@ -217,12 +213,7 @@ def __init__(
 
         # Metrics
         if self.enable_metrics:
-            self.metrics_collector = TokenizerMetricsCollector(
-                labels={
-                    "model_name": self.server_args.served_model_name,
-                    # TODO: Add lora name/path in the future,
-                },
-            )
+            TODO_moved
 
         self._result_dispatcher = TypeBasedDispatcher(
             [
@@ -886,40 +877,7 @@ def detokenize_top_logprobs_tokens(
         return ret
 
     def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int):
-        completion_tokens = (
-            recv_obj.completion_tokens[i]
-            if getattr(recv_obj, "completion_tokens", None)
-            else 0
-        )
-
-        if state.first_token_time is None:
-            state.first_token_time = time.time()
-            self.metrics_collector.observe_time_to_first_token(
-                state.first_token_time - state.created_time
-            )
-        else:
-            if completion_tokens >= 2:
-                # Compute time_per_output_token for the streaming case
-                self.metrics_collector.observe_time_per_output_token(
-                    (time.time() - state.first_token_time) / (completion_tokens - 1)
-                )
-
-        if state.finished:
-            self.metrics_collector.observe_one_finished_request(
-                recv_obj.prompt_tokens[i], completion_tokens
-            )
-            self.metrics_collector.observe_e2e_request_latency(
-                time.time() - state.created_time
-            )
-            # Compute time_per_output_token for the non-streaming case
-            if (
-                hasattr(state.obj, "stream")
-                and not state.obj.stream
-                and completion_tokens >= 1
-            ):
-                self.metrics_collector.observe_time_per_output_token(
-                    (time.time() - state.created_time) / completion_tokens
-                )
+        TODO_moved
 
     def dump_requests(self, state: ReqState, out_dict: dict):
         self.dump_request_list.append(
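
The two TODO_moved placeholders above mark call sites that later commits in the series wire up ("call generation_converter", "more tokenizer_manager call generation_manager"). A rough sketch of what that delegation could look like; the attribute names and the metric_state field are assumptions, not the PR's final code.

from sglang.srt.managers.generation_manager import _MetricManager


class TokenizerManager:
    def __init__(self, server_args):
        self.enable_metrics = server_args.enable_metrics
        if self.enable_metrics:
            # Replaces the inlined TokenizerMetricsCollector setup.
            self._metric_manager = _MetricManager(server_args)

    def collect_metrics(self, state, recv_obj, i):
        # Forward to the extracted manager instead of computing inline.
        self._metric_manager.handle_batch_output_metrics(
            recv_obj,
            i,
            state.metric_state,  # assumed: a _MetricReqState stored on ReqState
            finished=state.finished,
            stream=getattr(state.obj, "stream", None),
        )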