Skip to content

Updated server_queue to delete tasks from the queue when the server is shut down. Feature Request #6421 #6941

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <thread>
#include <signal.h>
#include <memory>
#include <iostream>
#include <boost/asio.hpp>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Boost will never be required


using json = nlohmann::ordered_json;

Expand Down Expand Up @@ -201,7 +203,7 @@ struct server_slot {

double t_prompt_processing; // ms
double t_token_generation; // ms

void reset() {
n_prompt_tokens = 0;
generated_text = "";
Expand Down Expand Up @@ -463,6 +465,50 @@ struct server_queue {
condition_tasks.notify_all();
}

// Host and port passed to check_server_health() from the queue's main loop.
// NOTE(review): both values are hard-coded; presumably they must match the
// address this server actually listens on — confirm against the server params.
std::string hostname_health = "127.0.0.1";
std::string port_health = "8080";

bool check_server_health(const std::string& server, const std::string& port) {
using namespace boost::asio;
io_service svc;
ip::tcp::socket socket(svc);
ip::tcp::resolver resolver(svc);
boost::system::error_code ec;

// Try to connect
connect(socket, resolver.resolve({server, port}), ec);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is nonsense to inner call the server itself. You have everything needed inside the process.

if (ec) {
std::cout << "Connection failed: " << ec.message() << std::endl;
return false;
}

// Send HTTP GET request to /health endpoint
std::string request = "GET /health HTTP/1.1\r\nHost: " + server + "\r\n\r\n";
write(socket, buffer(request), ec);
if (ec) {
std::cout << "Write failed: " << ec.message() << std::endl;
return false;
}

// Read the response
boost::asio::streambuf response;
read_until(socket, response, "\r\n", ec);
std::istream response_stream(&response);
std::string http_version;
unsigned int status_code;
response_stream >> http_version >> status_code;

bool server_status_ok = false;

// Check HTTP response status code
if (status_code == 200 || status_code == 500 || status_code == 503) {
server_status_ok = true;
}

return server_status_ok
}

/**
* Main loop consists of these steps:
* - Wait until a new task arrives
Expand All @@ -474,6 +520,13 @@ struct server_queue {
running = true;

while (true) {
bool health_check = check_server_health(hostname_health, port_health);
if (health_check == false) {
while(!queue_tasks.empty()) {
queue_tasks.erase(queue_tasks.begin());
}
break;
}
LOG_VERBOSE("new task may arrive", {});

while (true) {
Expand Down
87 changes: 87 additions & 0 deletions examples/server/tests/req-cancel-testing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import threading
import requests

# Stats: module-level counters reported in the summary at the end of the script
total_requests = 0  # number of requests the user asked to post
requests_executed = 0  # incremented by send_request() each time a response is received
requests_cancelled = 0  # set when a health check fails and the loop exits early
requests_remaining = 0  # total - executed - cancelled, computed after the loop

class StoppableThread(threading.Thread):
    """A ``threading.Thread`` that carries its own stop flag.

    The flag is a ``threading.Event`` stored on ``self.stop_event``. Nothing in
    this class interrupts the thread's target; code that wants to honour the
    flag has to poll it.
    """

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded unchanged to threading.Thread.
        super().__init__(*args, **kwargs)
        self.stop_event = threading.Event()

    def stop(self):
        """Set the stop flag."""
        self.stop_event.set()

    def stopped(self):
        """Return whether the stop flag is currently set."""
        return self.stop_event.is_set()

def send_request(stop_event):
    """POST one completion request unless ``stop_event`` is already set.

    The flag is checked once, before the request is issued; a request that is
    already in flight is not interrupted. When a response comes back, its body
    is printed and the module-level ``requests_executed`` counter is
    incremented. Timeouts and all other exceptions are reported on stdout and
    swallowed, so the function always returns ``None``.
    """
    # Declared up front so it is clear which name the increment below rebinds.
    global requests_executed

    try:
        if stop_event.is_set():
            return

        url = 'http://127.0.0.1:8080/completion'
        data = {
            'prompt': 'Hello llama',
            'n_predict': 2
        }
        response = requests.post(url, json=data, timeout=60)  # give up after 60 seconds
        print('Response:', response.text)
        requests_executed += 1
    except requests.exceptions.Timeout:
        print('Request timed out')
    except Exception as e:
        print('An error occurred:', str(e))

def get_health():
    """GET the server's ``/health`` endpoint and return the HTTP status code.

    Returns:
        The integer status code of the response, or ``None`` when the request
        timed out (10 second limit) or raised any other exception. Failures
        are reported on stdout rather than propagated.
    """
    url = 'http://127.0.0.1:8080/health'
    try:
        response = requests.get(url, timeout=10)
        return response.status_code
    except requests.exceptions.Timeout:
        print('Health check timed out')
        return None
    except Exception as e:
        print('An error occurred during health check:', str(e))
        return None


# User input for the number of requests
num_requests = int(input("How many requests would you like to post?\n"))

total_requests = num_requests

# Launch the requests one at a time; the operator presses Enter between them.
for i in range(num_requests):
    health = get_health()
    # Our server status: 500 and 503 count as "running" alongside 200.
    # A failed health check yields None, which is not in the tuple.
    ok_status = health in (200, 500, 503)

    if not ok_status:
        print(f"Server is not running. Status:{health}. Exiting now...\n")
        # This request and every later one were never sent.
        requests_cancelled = total_requests - i
        break

    # The event handed to send_request is separate from the thread's own
    # stop_event attribute; send_request only looks at this one.
    stop_event = threading.Event()
    req_thread = StoppableThread(target=send_request, args=(stop_event,))
    req_thread.start()

    input("Press Enter when request is complete or you would like to stop the request!\n")
    # Event.set() is idempotent, so no is_set() guard is needed.
    stop_event.set()

    req_thread.join()  # Ensure the thread finishes

requests_remaining = total_requests - requests_executed - requests_cancelled

print("\nSummary:")
print(f"Total requests: {total_requests}")
print(f"Requests executed: {requests_executed}")
print(f"Requests cancelled: {requests_cancelled}")
print(f"Requests remaining: {requests_remaining}")


Loading