feat: add cache_salt support to request (#10718)
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
@@ -207,6 +207,84 @@ class TestCacheReport(CustomTestCase):
|
||||
|
||||
# asyncio.run(run_test())
|
||||
|
||||
def test_cache_salt_effectiveness(self):
|
||||
print("=" * 100)
|
||||
print("Testing cache_salt effectiveness")
|
||||
|
||||
# Use a unique message to avoid interference with other tests
|
||||
test_message = "What is the capital of Japan?"
|
||||
|
||||
# First request with cache_salt "salt1"
|
||||
response1 = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[{"role": "user", "content": test_message}],
|
||||
temperature=0,
|
||||
max_tokens=10,
|
||||
extra_body={"cache_salt": "salt1"},
|
||||
)
|
||||
cached_tokens_1_first = int(response1.usage.prompt_tokens_details.cached_tokens)
|
||||
prompt_tokens_1 = int(response1.usage.prompt_tokens)
|
||||
print(
|
||||
f"First request with salt1 - cached_tokens: {cached_tokens_1_first}, prompt_tokens: {prompt_tokens_1}"
|
||||
)
|
||||
|
||||
# Second request with same cache_salt "salt1" - should get cache hit
|
||||
response2 = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[{"role": "user", "content": test_message}],
|
||||
temperature=0,
|
||||
max_tokens=10,
|
||||
extra_body={"cache_salt": "salt1"},
|
||||
)
|
||||
cached_tokens_1_second = int(
|
||||
response2.usage.prompt_tokens_details.cached_tokens
|
||||
)
|
||||
print(
|
||||
f"Second request with salt1 - cached_tokens: {cached_tokens_1_second}, prompt_tokens: {prompt_tokens_1}"
|
||||
)
|
||||
|
||||
# Verify cache hit for same salt
|
||||
assert (
|
||||
cached_tokens_1_second > cached_tokens_1_first
|
||||
), "Should have cache hit with same cache_salt"
|
||||
assert (
|
||||
cached_tokens_1_second == prompt_tokens_1 - 1
|
||||
), "Should cache all prompt tokens except the last one"
|
||||
|
||||
# Third request with different cache_salt "salt2" - should not get cache hit
|
||||
response3 = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[{"role": "user", "content": test_message}],
|
||||
temperature=0,
|
||||
max_tokens=10,
|
||||
extra_body={"cache_salt": "salt2"},
|
||||
)
|
||||
cached_tokens_2_first = int(response3.usage.prompt_tokens_details.cached_tokens)
|
||||
print(f"First request with salt2 - cached_tokens: {cached_tokens_2_first}")
|
||||
|
||||
# Verify no cache hit for different salt (should be similar to first request with salt1)
|
||||
assert (
|
||||
cached_tokens_2_first <= cached_tokens_1_first + self.min_cached
|
||||
), "Different cache_salt should not share cache"
|
||||
|
||||
# Fourth request with same cache_salt "salt2" - should now get cache hit
|
||||
response4 = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[{"role": "user", "content": test_message}],
|
||||
temperature=0,
|
||||
max_tokens=10,
|
||||
extra_body={"cache_salt": "salt2"},
|
||||
)
|
||||
cached_tokens_2_second = int(
|
||||
response4.usage.prompt_tokens_details.cached_tokens
|
||||
)
|
||||
print(f"Second request with salt2 - cached_tokens: {cached_tokens_2_second}")
|
||||
|
||||
# Verify cache hit for salt2
|
||||
assert (
|
||||
cached_tokens_2_second == cached_tokens_2_first
|
||||
), "Should have cache hit with same cache_salt for salt2"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user