feat: add cache_salt support to request (#10718)

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Xinyuan Tong
2025-09-23 23:30:25 -07:00
committed by GitHub
parent b1f0fc1c0b
commit 71f24ef8f6
9 changed files with 117 additions and 1 deletions

View File

@@ -207,6 +207,84 @@ class TestCacheReport(CustomTestCase):
# asyncio.run(run_test())
def test_cache_salt_effectiveness(self):
print("=" * 100)
print("Testing cache_salt effectiveness")
# Use a unique message to avoid interference with other tests
test_message = "What is the capital of Japan?"
# First request with cache_salt "salt1"
response1 = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": test_message}],
temperature=0,
max_tokens=10,
extra_body={"cache_salt": "salt1"},
)
cached_tokens_1_first = int(response1.usage.prompt_tokens_details.cached_tokens)
prompt_tokens_1 = int(response1.usage.prompt_tokens)
print(
f"First request with salt1 - cached_tokens: {cached_tokens_1_first}, prompt_tokens: {prompt_tokens_1}"
)
# Second request with same cache_salt "salt1" - should get cache hit
response2 = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": test_message}],
temperature=0,
max_tokens=10,
extra_body={"cache_salt": "salt1"},
)
cached_tokens_1_second = int(
response2.usage.prompt_tokens_details.cached_tokens
)
print(
f"Second request with salt1 - cached_tokens: {cached_tokens_1_second}, prompt_tokens: {prompt_tokens_1}"
)
# Verify cache hit for same salt
assert (
cached_tokens_1_second > cached_tokens_1_first
), "Should have cache hit with same cache_salt"
assert (
cached_tokens_1_second == prompt_tokens_1 - 1
), "Should cache all prompt tokens except the last one"
# Third request with different cache_salt "salt2" - should not get cache hit
response3 = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": test_message}],
temperature=0,
max_tokens=10,
extra_body={"cache_salt": "salt2"},
)
cached_tokens_2_first = int(response3.usage.prompt_tokens_details.cached_tokens)
print(f"First request with salt2 - cached_tokens: {cached_tokens_2_first}")
# Verify no cache hit for different salt (should be similar to first request with salt1)
assert (
cached_tokens_2_first <= cached_tokens_1_first + self.min_cached
), "Different cache_salt should not share cache"
# Fourth request with same cache_salt "salt2" - should now get cache hit
response4 = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": test_message}],
temperature=0,
max_tokens=10,
extra_body={"cache_salt": "salt2"},
)
cached_tokens_2_second = int(
response4.usage.prompt_tokens_details.cached_tokens
)
print(f"Second request with salt2 - cached_tokens: {cached_tokens_2_second}")
# Verify cache hit for salt2
assert (
cached_tokens_2_second == cached_tokens_2_first
), "Should have cache hit with same cache_salt for salt2"
if __name__ == "__main__":
unittest.main()