diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index b9ede4263..b7d90a477 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -49,7 +49,7 @@ "\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0\"\n", + " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")" @@ -105,9 +105,9 @@ "response = requests.get(url)\n", "response_json = response.json()\n", "print_highlight(response_json)\n", - "assert response_json[\"model_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n", + "assert response_json[\"model_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n", "assert response_json[\"is_generation\"] is True\n", - "assert response_json[\"tokenizer_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n", + "assert response_json[\"tokenizer_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n", "assert response_json.keys() == {\"model_path\", \"is_generation\", \"tokenizer_path\"}" ] }, @@ -213,7 +213,7 @@ "# successful update with same architecture and size\n", "\n", "url = f\"http://localhost:{port}/update_weights_from_disk\"\n", - "data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n", + "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct\"}\n", "\n", "response = requests.post(url, json=data)\n", "print_highlight(response.text)\n", @@ -230,7 +230,7 @@ "# failed update with different parameter size or wrong name\n", "\n", "url = f\"http://localhost:{port}/update_weights_from_disk\"\n", - "data = {\"model_path\": \"meta-llama/Llama-3.2-1B-wrong\"}\n", + "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct-wrong\"}\n", "\n", "response = requests.post(url, json=data)\n", "response_json = response.json()\n", @@ -238,11 +238,20 @@ "assert response_json[\"success\"] is False\n", "assert response_json[\"message\"] == (\n", " \"Failed to get weights iterator: \"\n", - " \"meta-llama/Llama-3.2-1B-wrong\"\n", + " \"qwen/qwen2.5-0.5b-instruct-wrong\"\n", " \" (repository not found).\"\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -259,11 +268,9 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process)\n", - "\n", "embedding_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", + "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n", " --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", @@ -280,7 +287,7 @@ "# successful encode for embedding model\n", "\n", "url = f\"http://localhost:{port}/encode\"\n", - "data = {\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"text\": \"Once upon a time\"}\n", + "data = {\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"text\": \"Once upon a time\"}\n", "\n", "response = requests.post(url, json=data)\n", "response_json = response.json()\n", @@ -318,7 +325,7 @@ "\n", "reward_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n", + "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", "\n", @@ -383,7 +390,7 @@ "outputs": [], "source": [ "expert_record_server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0\"\n", + " \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")" @@ -449,7 +456,7 @@ "source": [ "tokenizer_free_server_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --skip-tokenizer-init\n", + "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --skip-tokenizer-init\n", "\"\"\"\n", ")\n", "\n", @@ -464,7 +471,7 @@ "source": [ "from transformers import AutoTokenizer\n", "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-3.2-1B-Instruct\")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"qwen/qwen2.5-0.5b-instruct\")\n", "\n", "input_text = \"What is the capital of France?\"\n", "\n", diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index fac3532a0..040fd17a8 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -83,7 +83,7 @@ " nest_asyncio.apply()\n", "\n", "\n", - "llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")" + "llm = sgl.Engine(model_path=\"qwen/qwen2.5-0.5b-instruct\")" ] }, { diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index b874283d4..2fc74a7be 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -44,7 +44,7 @@ "\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n", + " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --mem-fraction-static 0.8\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")\n", @@ -75,7 +75,7 @@ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", "\n", "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " messages=[\n", " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", " ],\n", @@ -104,7 +104,7 @@ "outputs": [], "source": [ "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " messages=[\n", " {\n", " \"role\": \"system\",\n", @@ -143,7 +143,7 @@ "outputs": [], "source": [ "stream = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n", " stream=True,\n", ")\n", @@ -169,7 +169,7 @@ "outputs": [], "source": [ "response = client.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " prompt=\"List 3 countries and their capitals.\",\n", " temperature=0,\n", " max_tokens=64,\n", @@ -198,7 +198,7 @@ "outputs": [], "source": [ "response = client.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " prompt=\"Write a short story about a space explorer.\",\n", " temperature=0.7, # Moderate temperature for creative writing\n", " max_tokens=150, # Longer response for a story\n", @@ -257,7 +257,7 @@ " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", - " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", " \"messages\": [\n", " {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n", " ],\n", @@ -269,7 +269,7 @@ " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", - " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", " \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n", " \"max_tokens\": 50,\n", " },\n", @@ -362,7 +362,7 @@ " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", - " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", " \"messages\": [\n", " {\n", " \"role\": \"system\",\n", @@ -439,7 +439,7 @@ " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", - " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", " \"messages\": [\n", " {\n", " \"role\": \"system\",\n", diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 38543fa3b..742185f82 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -40,7 +40,7 @@ "\n", "embedding_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", + "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n", " --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", @@ -66,7 +66,7 @@ "text = \"Once upon a time\"\n", "\n", "curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n", - " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n", + " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n", "\n", "text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n", " \"embedding\"\n", @@ -94,7 +94,7 @@ "\n", "response = requests.post(\n", " f\"http://localhost:{port}/v1/embeddings\",\n", - " json={\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": text},\n", + " json={\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": text},\n", ")\n", "\n", "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n", @@ -121,7 +121,7 @@ "\n", "# Text embedding example\n", "response = client.embeddings.create(\n", - " model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n", + " model=\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\",\n", " input=text,\n", ")\n", "\n", @@ -150,11 +150,11 @@ "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\")\n", "input_ids = tokenizer.encode(text)\n", "\n", "curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n", - " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n", + " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n", "\n", "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n", " 0\n", diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 63a03203b..af52fcfb8 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -29,7 +29,7 @@ "\n", "Launch the server in your terminal and wait for it to initialize.\n", "\n", - "**Remember to add** `--chat-template llama_3_vision` **to specify the [vision chat template](https://docs.sglang.ai/backend/openai_api_vision.html#Chat-Template), otherwise, the server will only support text (images won’t be passed in), which can lead to degraded performance.**\n", + "**Remember to add** `--chat-template` **for example** `--chat-template=qwen2-vl` **to specify the [vision chat template](https://docs.sglang.ai/backend/openai_api_vision.html#Chat-Template), otherwise, the server will only support text (images won’t be passed in), which can lead to degraded performance.**\n", "\n", "We need to specify `--chat-template` for vision language models because the chat template provided in Hugging Face tokenizer only supports text." ] @@ -51,8 +51,8 @@ "\n", "vision_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n", - " --chat-template=llama_3_vision\n", + "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct \\\n", + " --chat-template=qwen2-vl\n", "\"\"\"\n", ")\n", "\n", @@ -79,7 +79,7 @@ "curl_command = f\"\"\"\n", "curl -s http://localhost:{port}/v1/chat/completions \\\\\n", " -d '{{\n", - " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", + " \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n", " \"messages\": [\n", " {{\n", " \"role\": \"user\",\n", @@ -127,7 +127,7 @@ "url = f\"http://localhost:{port}/v1/chat/completions\"\n", "\n", "data = {\n", - " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", + " \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n", " \"messages\": [\n", " {\n", " \"role\": \"user\",\n", @@ -167,7 +167,7 @@ "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n", "\n", "response = client.chat.completions.create(\n", - " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", + " model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", @@ -211,7 +211,7 @@ "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n", "\n", "response = client.chat.completions.create(\n", - " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", + " model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", diff --git a/docs/backend/send_request.ipynb b/docs/backend/send_request.ipynb index 7e23702b7..455c6431b 100644 --- a/docs/backend/send_request.ipynb +++ b/docs/backend/send_request.ipynb @@ -35,11 +35,11 @@ "\n", "# This is equivalent to running the following command in your terminal\n", "\n", - "# python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\n", + "# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\n", "\n", "server_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", + "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n", " --host 0.0.0.0\n", "\"\"\"\n", ")\n", @@ -65,7 +65,7 @@ "curl_command = f\"\"\"\n", "curl -s http://localhost:{port}/v1/chat/completions \\\n", " -H \"Content-Type: application/json\" \\\n", - " -d '{{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n", + " -d '{{\"model\": \"qwen/qwen2.5-0.5b-instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n", "\"\"\"\n", "\n", "response = json.loads(subprocess.check_output(curl_command, shell=True))\n", @@ -90,7 +90,7 @@ "url = f\"http://localhost:{port}/v1/chat/completions\"\n", "\n", "data = {\n", - " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n", " \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n", "}\n", "\n", @@ -116,7 +116,7 @@ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n", "\n", "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " messages=[\n", " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", " ],\n", @@ -145,7 +145,7 @@ "\n", "# Use stream=True for streaming responses\n", "response = client.chat.completions.create(\n", - " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " model=\"qwen/qwen2.5-0.5b-instruct\",\n", " messages=[\n", " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", " ],\n",