diff --git a/docs/release_process.md b/docs/release_process.md new file mode 100644 index 000000000..1bab0fd14 --- /dev/null +++ b/docs/release_process.md @@ -0,0 +1,8 @@ +``` +pip install build twine +``` + +``` +cd python +bash upload_pypi.sh +``` \ No newline at end of file diff --git a/docs/test_process.md b/docs/test_process.md index fcb03ad6f..1c1bc0f2c 100644 --- a/docs/test_process.md +++ b/docs/test_process.md @@ -81,3 +81,9 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port cd test/lang python3 run_all.py ``` + +## OpenAI API server +``` +cd test/srt +python test_openai_server.py +``` \ No newline at end of file diff --git a/playground/load_tokenizer.py b/playground/load_tokenizer.py index 33a6a700d..1429e06e3 100644 --- a/playground/load_tokenizer.py +++ b/playground/load_tokenizer.py @@ -1,7 +1,8 @@ import transformers import code -name = "meta-llama/Llama-2-7b-chat-hf" +#name = "meta-llama/Llama-2-7b-chat-hf" +name = "meta-llama/Meta-Llama-3-8B-Instruct" t = transformers.AutoTokenizer.from_pretrained(name) code.interact(local=locals()) diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py index 19434911d..ea1b7ae5e 100644 --- a/python/sglang/srt/managers/router/model_rpc.py +++ b/python/sglang/srt/managers/router/model_rpc.py @@ -426,7 +426,9 @@ class ModelRpcServer: # Only transfer the selected logprobs of the next token to CPU to reduce overhead. if last_logprobs is not None: last_token_logprobs = ( - last_logprobs[torch.arange(len(batch.reqs)), next_token_ids].tolist() + last_logprobs[ + torch.arange(len(batch.reqs), device=next_token_ids.device), + next_token_ids].tolist() ) next_token_ids = next_token_ids.tolist() @@ -587,6 +589,7 @@ class ModelRpcServer: - req.prompt_tokens, "completion_tokens_wo_jump_forward": req.completion_tokens_wo_jump_forward, "finish_reason": str(req.finish_reason), # FIXME: convert to the correct string + "hit_stop_str": req.hit_stop_str, } if req.return_logprob: ( diff --git a/python/sglang/srt/managers/router/model_runner.py b/python/sglang/srt/managers/router/model_runner.py index 565e1beac..65363a386 100644 --- a/python/sglang/srt/managers/router/model_runner.py +++ b/python/sglang/srt/managers/router/model_runner.py @@ -110,8 +110,8 @@ class InputMetadata: self.kv_last_page_len = torch.ones( (self.batch_size,), dtype=torch.int32, device="cuda" ) - req_pool_indices_cpu = self.req_pool_indices.cpu().tolist() - seq_lens_cpu = self.seq_lens.tolist() + req_pool_indices_cpu = self.req_pool_indices.cpu().numpy() + seq_lens_cpu = self.seq_lens.cpu().numpy() self.kv_indices = torch.cat( [ self.req_to_token_pool.req_to_token[ diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 569eaccb7..a77319b1b 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -163,7 +163,7 @@ def test_regex(args): regex = ( r"""\{\n""" + r""" "name": "[\w]+",\n""" - + r""" "population": [\w\d\s]+\n""" + + r""" "population": [\d]+\n""" + r"""\}""" )