Clean up README, LLaVA examples, usage examples, and NCCL init (#1194)

2024-08-24 08:02:23 -07:00
parent c9064e6fd9
commit f6af3a6561
65 changed files with 174 additions and 317 deletions
--- a/test/srt/models/test_embedding_models.py
+++ b/test/srt/models/test_embedding_models.py
@@ -59,7 +59,7 @@ class TestEmbeddingModels(unittest.TestCase):
                tolerance = 1e-2
                assert torch.all(
                    abs(similarities - 1) < tolerance
-                ), f"embeddings not all close"
+                ), "embeddings are not all close"

    def test_prefill_logits(self):
        for model, tp_size in MODELS:
--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -59,7 +59,7 @@ class TestGenerationModels(unittest.TestCase):
                tolerance = 3e-2
                assert torch.all(
                    abs(hf_logprobs - srt_logprobs) < tolerance
-                ), f"prefill logprobs not all close"
+                ), "prefill logprobs are not all close"

        print(hf_outputs.output_strs)
        print(srt_outputs.output_strs)
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -14,7 +14,7 @@ suites = {
        "test_torch_compile.py",
        "test_triton_attn_backend.py",
        "test_vision_openai_server.py",
-        "test_large_max_new_tokens.py",
+        "test_update_weights.py",
        "models/test_generation_models.py",
        "models/test_embedding_models.py",
        "sampling/penaltylib",
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -2,8 +2,6 @@ import base64
 import io
 import json
 import os
-import sys
-import time
 import unittest

 import numpy as np
@@ -12,12 +10,10 @@ import requests
 from decord import VideoReader, cpu
 from PIL import Image

-from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server


-# python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tokenizer-path lmms-lab/llavanext-qwen-siglip-tokenizer --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384
 class TestOpenAIVisionServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -32,11 +28,9 @@ class TestOpenAIVisionServer(unittest.TestCase):
            other_args=[
                "--chat-template",
                "chatml-llava",
-                "--tokenizer-path",
-                "lmms-lab/llavanext-qwen-siglip-tokenizer",
                "--chunked-prefill-size",
                "16384",
-                "--log-requests",
+                # "--log-requests",
            ],
        )
        cls.base_url += "/v1"
@@ -132,7 +126,6 @@ class TestOpenAIVisionServer(unittest.TestCase):

        messages = self.prepare_video_messages(file_path)

-        start_time = time.time()
        video_request = client.chat.completions.create(
            model="default",
            messages=messages,
@@ -140,15 +133,14 @@ class TestOpenAIVisionServer(unittest.TestCase):
            max_tokens=1024,
            stream=True,
        )
+
        print("-" * 30)
        video_response = ""
-
        for chunk in video_request:
            if chunk.choices[0].delta.content is not None:
                content = chunk.choices[0].delta.content
                video_response += content
-                sys.stdout.write(content)
-                sys.stdout.flush()
+                print(content, end="", flush=True)
        print("-" * 30)

        # Add assertions to validate the video response