首先克隆源代码
# Clone the vLLM source tree (subsequent steps run from the resulting ./vllm dir)
git clone https://github.com/vllm-project/vllm.git
在每个worker上安装依赖
# Install the TPU-specific Python dependencies (run on every worker)
cd vllm && pip install --requirement requirements-tpu.txt
安装系统依赖库
# Install system libraries vLLM links against (BLAS, OpenMPI, OpenMP);
# NEEDRESTART_MODE=a auto-restarts services so the install is non-interactive
sudo apt-get update && sudo NEEDRESTART_MODE=a apt-get install -y libopenblas-base libopenmpi-dev libomp-dev
安装vllm
# Build/install vLLM in develop mode targeting TPU.
# Fix: the original used curly quotes (”tpu”), which the shell treats as literal
# characters — VLLM_TARGET_DEVICE would be set to the wrong value. Use ASCII quotes.
cd vllm && VLLM_TARGET_DEVICE="tpu" python setup.py develop
开启ray头节点
# Start the Ray head node on port 6379.
# Fix: the original used en-dashes (–head, –port); CLI flags need ASCII --.
ray start --head --port=6379
其他worker连接头节点
# Join a worker to the head node (replace xx.xx.xx.xx with the head's IP).
# Fix: the original used an en-dash (–address) and curly quotes (’…′),
# which break both flag parsing and the address value. Use ASCII -- and '.
ray start --address='xx.xx.xx.xx:6379'
启动 OpenAI 兼容的 API server
# Launch the OpenAI-compatible API server on all interfaces, port 8000.
# Fix: every flag in the original used an en-dash (–host, –port, …); replaced with
# ASCII --. The tensor-parallel size must evenly divide the model's 40 attention
# heads, so with 4- or 8-chip TPU slices only 4 or 8 are valid here.
python3 -m vllm.entrypoints.openai.api_server --host=0.0.0.0 --port=8000 --tensor-parallel-size=芯片数量(4或8，由于有40个head因此此选项只能是4或8) --model=Qwen/QwQ-32B --trust-remote-code