# Chat with llama.cpp

## Start the llama.cpp server

The `-hf` flag tells llama-server to download the quantized GGUF model directly from Hugging Face before serving it:

```bash
./build/bin/llama-server -hf Qwen/Qwen2-0.5B-Instruct-GGUF:Q2_K
```
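Downloading and loading the model can take a moment, so it's worth checking readiness before sending requests. As a minimal sketch (assuming the server is on its default port 8080), llama-server exposes a `/health` endpoint:

```bash
# Returns {"status":"ok"} once the model is loaded and the
# server is ready to accept requests; 503 while still loading.
curl http://localhost:8080/health
```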
## Send a POST request with curl

The server exposes an OpenAI-compatible chat completions endpoint. By default llama-server listens on port 8080:

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Hello! How are you today?"}
    ],
    "max_tokens": 150,
    "temperature": 0.7
  }'
```
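For longer generations you may prefer tokens as they are produced. Since the endpoint follows the OpenAI convention, adding `"stream": true` to the request body should switch the server to server-sent events; a sketch:

```bash
# Same request, but stream tokens back as server-sent events (SSE).
# -N disables curl's output buffering so chunks appear immediately.
curl -N -X POST http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Hello! How are you today?"}
    ],
    "stream": true
  }'
```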
## Parse the response

The reply is a standard OpenAI-style chat completion object, with a llama.cpp-specific `timings` block appended:

```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "I'm good, thanks! How can I assist you today?"
      }
    }
  ],
  "created": 1751341871,
  "model": "gpt-3.5-turbo",
  "system_fingerprint": "b5787-0a5a3b5c",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 14,
    "prompt_tokens": 15,
    "total_tokens": 29
  },
  "id": "chatcmpl-bGxdAhvPfBtQ8O5Uc9hrPGWFdv1w1m9e",
  "timings": {
    "prompt_n": 12,
    "prompt_ms": 96.768,
    "prompt_per_token_ms": 8.064,
    "prompt_per_second": 124.0079365079365,
    "predicted_n": 14,
    "predicted_ms": 216.101,
    "predicted_per_token_ms": 15.435785714285714,
    "predicted_per_second": 64.78452205218855
  }
}
```
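In a shell pipeline, jq makes it easy to pull out the fields you care about. A small sketch, assuming the JSON above was saved to a hypothetical `response.json`:

```bash
# Extract just the assistant's reply text.
jq -r '.choices[0].message.content' response.json

# Generation speed in tokens per second, from the timings block.
jq '.timings.predicted_per_second' response.json
```

The `timings` block is also handy for quick benchmarking: `prompt_per_second` measures prompt processing, while `predicted_per_second` measures token generation.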