use actix_web::{get, post, web, App, HttpServer, HttpResponse, Responder};
use serde::{Deserialize, Serialize};
use env_logger::Env;
use log::info;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command;
#[get("/")]
async fn hello() -> impl Responder {
    info!("Request received");
    "Hello, world!"
}
#[post("/health")]
async fn echo(req_body: String) -> impl Responder {
    format!("You sent: {}", req_body)
}
#[derive(Deserialize)]
struct Info {
    name: String,
}

#[derive(Serialize)]
struct Greeting {
    message: String,
}
#[post("/json")]
async fn json_handler(info: web::Json<Info>) -> impl Responder {
let response = Greeting {
message: format!("Hello, {}!", info.name),
};
web::Json(response) // Respond with JSON
}
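As far as I can tell, the JSON handler itself is fine; a minimal integration test sketch like the one below (assuming actix-web 4 with the default macros feature and serde_json as a dev-dependency) exercises it in isolation:

#[cfg(test)]
mod tests {
    use super::*;
    use actix_web::{test, web, App};

    #[actix_web::test]
    async fn json_handler_greets() {
        // Build an in-memory service with just the JSON route registered.
        let app = test::init_service(App::new().service(json_handler)).await;
        let req = test::TestRequest::post()
            .uri("/json")
            .set_json(serde_json::json!({ "name": "world" }))
            .to_request();
        // The handler should serialize a Greeting back as JSON.
        let body = test::call_and_read_body(&app, req).await;
        assert_eq!(body, web::Bytes::from_static(br#"{"message":"Hello, world!"}"#));
    }
}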
#[get("/run-script")]
async fn run_script() -> impl Responder {
// Spawn the async task to run the llama.cpp server in a separate thread
let _llama_thread = tokio::spawn(async {
run_llama_server().await;
});
// Log that the script has been triggered
println!("Main server thread running...");
HttpResponse::Ok().body("Llama.cpp server started successfully!")
}
async fn run_llama_server() {
    // Command to run the llama.cpp shell script
    let mut child = Command::new("sh")
        .arg("./src/public/run-model.sh") // Path to the llama.cpp script
        .stdout(std::process::Stdio::piped()) // Capture stdout
        .stderr(std::process::Stdio::piped()) // Capture stderr
        .spawn()
        .expect("Failed to start the llama.cpp server");

    // Capture and process the output in real-time
    let stdout = child.stdout.take().unwrap();
    let reader = BufReader::new(stdout);
    let mut lines = reader.lines();
    while let Some(line) = lines.next_line().await.unwrap() {
        // Print the output from the shell script to the main thread stdout
        println!("Llama server log: {}", line);
    }

    // Wait for the process to finish (don't await child directly, use .wait().await)
    child.wait().await.expect("Failed to wait on llama.cpp server process");
}
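One thing I suspect: stderr is piped but never read, so if the script writes enough to stderr the pipe can fill up and stall the child. A variant that drains both pipes and logs the script's exit status would look roughly like this (a sketch under that assumption, not my current code; same script path as above):

// Hypothetical variant: drain stdout and stderr concurrently and report
// how the script actually exited.
async fn run_llama_server_verbose() {
    let mut child = Command::new("sh")
        .arg("./src/public/run-model.sh")
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .expect("Failed to start the llama.cpp server");

    let stdout = child.stdout.take().expect("stdout was not captured");
    let stderr = child.stderr.take().expect("stderr was not captured");

    // Drain stderr on its own task so a full pipe can't block the child.
    let stderr_task = tokio::spawn(async move {
        let mut lines = BufReader::new(stderr).lines();
        while let Ok(Some(line)) = lines.next_line().await {
            eprintln!("Llama server err: {}", line);
        }
    });

    let mut lines = BufReader::new(stdout).lines();
    while let Ok(Some(line)) = lines.next_line().await {
        println!("Llama server log: {}", line);
    }

    let status = child.wait().await.expect("Failed to wait on llama.cpp server process");
    let _ = stderr_task.await;
    // If the script dies early, this shows the exit status instead of silence.
    println!("run-model.sh exited with status: {}", status);
}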
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
    HttpServer::new(move || {
        App::new()
            .service(hello)        // GET /
            .service(echo)         // POST /health
            .service(json_handler) // POST /json
            .service(run_script)   // GET /run-script
    })
    .bind("localhost:8080")?
    .run()
    .await
}
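Since the shutdown is triggered by a SIGTERM, one way to rule out actix's own signal handling is HttpServer::disable_signals(). A minimal standalone sketch (not my actual main, routes trimmed for brevity):

// With signals disabled, actix no longer installs its own SIGTERM/SIGINT
// handlers, so a real external SIGTERM would kill the process outright
// instead of logging a graceful shutdown.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    HttpServer::new(|| App::new().service(hello))
        .disable_signals()
        .bind("localhost:8080")?
        .run()
        .await
}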
And here is my run-model.sh:
#!/bin/bash
VERSION="b3658" # Change this if the llama.cpp release version changes
INSTALL_DIR="$HOME/.pyano"
BUILD_DIR="$INSTALL_DIR/build/bin"
MODEL_DIR="$HOME/.pyano/models"
MODEL_NAME="" # Both of these are filled in by select_model below
MODEL_PATH=""
# Function to get system RAM in GB
get_system_ram() {
    if [[ "$OSTYPE" == "darwin"* ]]; then
        # macOS
        ram_gb=$(( $(sysctl -n hw.memsize) / 1024 / 1024 / 1024 ))
    elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
        # Linux
        ram_gb=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}') / 1024 / 1024 ))
    else
        echo "Unsupported OS type: $OSTYPE"
        exit 1
    fi
}
# Function to select model based on RAM.
# BATCH_SIZE is the number of prompt tokens fed to the model per chunk
# (e.g. an 8-token prompt with a batch size of 4 is sent as two chunks);
# larger batches can speed up prompt processing but use more memory.
# GPU_LAYERS_OFFLOADED is the number of layers to put on the GPU; the rest
# stay on the CPU. Use -1 to offload all layers to the GPU.
select_model() {
    get_system_ram
    if [ "$ram_gb" -lt 9 ]; then
        MODEL_NAME="qwen2.5-coder-1.5b-instruct-q8_0.gguf"
        MODEL_URL="https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q8_0.gguf"
        CTX=8192
        BATCH_SIZE=1024
        GPU_LAYERS_OFFLOADED=16
    elif [ "$ram_gb" -gt 24 ]; then
        MODEL_NAME="Meta-Llama-3.1-70B-Instruct-IQ1_M.gguf"
        MODEL_URL="https://huggingface.co/bartowski/Meta-Llama-3.1-70B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-70B-Instruct-IQ1_M.gguf"
        CTX=32768
        BATCH_SIZE=8192
        GPU_LAYERS_OFFLOADED=-1
    else
        MODEL_NAME="Qwen2.5-Coder-7B-Instruct-Q8_0.gguf"
        MODEL_URL="https://huggingface.co/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf"
        CTX=16192
        BATCH_SIZE=4096
        GPU_LAYERS_OFFLOADED=-1
    fi
    MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
}
select_model
# Function to determine OS and set the download URL and ZIP file name
set_download_info() {
    if [[ "$OSTYPE" == "darwin"* ]]; then
        ZIP_FILE="llama-$VERSION-bin-macos-arm64.zip"
    elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
        ZIP_FILE="llama-$VERSION-bin-ubuntu-x64.zip"
    else
        echo "Unsupported OS type: $OSTYPE"
        exit 1
    fi
    DOWNLOAD_URL="https://github.com/ggerganov/llama.cpp/releases/download/$VERSION/$ZIP_FILE"
}
# Install unzip on Ubuntu (needed to extract the llama.cpp release)
if [[ "$OSTYPE" == "linux-gnu"* ]]; then
    sudo apt-get install -y unzip
fi
# Function to check if the model file is present and download it if not
check_and_download_model() {
    # Create the model directory if it doesn't exist
    mkdir -p "$MODEL_DIR"
    # Check if the model file exists
    if [[ ! -f "$MODEL_PATH" ]]; then
        echo "Model file $MODEL_NAME not found. Downloading..."
        # Delegate the download to the helper script shipped next to this one
        "$(dirname "$0")/download_file" "$MODEL_URL" "$MODEL_PATH"
echo "Model file downloaded to $MODEL_DIR/$MODEL_NAME."
else
echo "Model file $MODEL_NAME already exists in $MODEL_DIR."
fi
}
# Function to install runtime dependencies for llama.cpp (libgomp on Linux)
install_requirements_llama() {
    if [[ "$OSTYPE" == "linux-gnu"* ]]; then
        sudo apt-get update
        sudo apt-get install -y libgomp1
    fi
}
# Function to download and unzip if the version is not present
download_and_unzip() {
    # Check if curl or wget is installed and set DOWNLOAD_CMD accordingly
    if command -v curl &> /dev/null; then
        DOWNLOAD_CMD="curl -Lo"
    elif command -v wget &> /dev/null; then
        DOWNLOAD_CMD="wget -O" # -O writes to the given file; -P would treat it as a directory prefix
    else
        echo "Neither curl nor wget is installed. Installing curl..."
        if [[ "$OSTYPE" == "linux-gnu"* ]]; then
            sudo apt-get update && sudo apt-get install -y curl
        elif [[ "$OSTYPE" == "darwin"* ]]; then
            brew install curl
        else
            echo "Unsupported OS for automatic curl installation. Please install curl or wget manually."
            exit 1
        fi
        DOWNLOAD_CMD="curl -Lo"
    fi
    # Create the ~/.pyano/ directory if it doesn't exist
    mkdir -p "$INSTALL_DIR"
    # Download the appropriate file based on the OS
    if [[ ! -f "$INSTALL_DIR/$ZIP_FILE" ]]; then
        echo "Downloading $ZIP_FILE..."
        $DOWNLOAD_CMD "$INSTALL_DIR/$ZIP_FILE" "$DOWNLOAD_URL"
        # Unzip the downloaded file
        echo "Unzipping $ZIP_FILE..."
        unzip "$INSTALL_DIR/$ZIP_FILE" -d "$INSTALL_DIR/"
    else
        echo "$ZIP_FILE already exists, skipping download and unzip."
    fi
}
check_and_download_model
# Set download info based on the OS
set_download_info
# Download and unzip the file
download_and_unzip
install_requirements_llama
# Ensure MODEL_PATH is set
if [ -z "$MODEL_PATH" ]; then
    echo "MODEL_PATH is not set. Please set the path to your model."
    exit 1
fi
# Calculate the number of CPU cores
get_num_cores() {
    if command -v nproc &> /dev/null; then
        # Linux
        num_cores=$(nproc)
    elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
        # Linux fallback
        num_cores=$(grep -c ^processor /proc/cpuinfo)
    elif [[ "$OSTYPE" == "darwin"* || "$OSTYPE" == "bsd"* ]]; then
        # macOS / BSD
        num_cores=$(sysctl -n hw.ncpu)
    else
        echo "Unsupported OS type: $OSTYPE"
        return 1
    fi
}
get_num_cores
echo "Model being used $MODEL_PATH"
echo "Number of cores are $num_cores"
# Run the server command
"$BUILD_DIR/llama-server" \
    -m "$MODEL_PATH" \
    --ctx-size $CTX \
    --parallel 2 \
    --n-gpu-layers $GPU_LAYERS_OFFLOADED \
    --port 52555 \
    --threads $num_cores \
    --color \
    --metrics \
    --batch-size $BATCH_SIZE \
    --numa isolate \
    --mlock \
    --no-mmap \
    --conversation \
    --flash-attn \
    --cache-type-k f16 \
    --cache-type-v f16 \
    --prompt-cache-all \
    --repeat-last-n 64 \
    --repeat-penalty 1.3 \
    --top-k 40 \
    --top-p 0.9
I have tried multiple times, but the server starts, prints only a few lines of logs, and then suddenly prints this line:
[2024-09-22T11:12:07Z INFO actix_server::server] SIGTERM received; starting graceful shutdown