搭建通义千问模型
使用ollama部署(推荐)
# NVIDIA GPU support: install the container toolkit so Docker can pass GPUs through
apt-get install -y nvidia-container-toolkit
# Aliyun mirror of the upstream ollama/ollama image
docker run -d --gpus=all -v $(pwd)/ollama:/root/.ollama -p 11434:11434 --name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama
# Also runs without a GPU (CPU-only)
docker run -d -v $(pwd)/ollama:/root/.ollama -p 11434:11434 --name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama
# Model library: https://ollama.com/library
# Qwen tags: https://ollama.ai/library/qwen/tags
# Small model, downloads quickly
docker exec -it ollama ollama run qwen:0.5b
# ollama webui
# Aliyun mirror of ghcr.io/ollama-webui/ollama-webui:main
docker run -d -p 8080:8080 --add-host=host.docker.internal:host-gateway --name ollama-webui --restart always registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-webui-ollama-webui:main
# API usage reference:
https://github.com/ollama/ollama/blob/main/docs/api.md
官方部署(不推荐)
# Fetch the model files from https://modelscope.cn/models/qwen/Qwen-7B-Chat/files
git lfs install
git clone https://www.modelscope.cn/qwen/Qwen-7B-Chat.git
# Register nvidia-container-toolkit as a Docker runtime
nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
# Sanity check: a container should be able to see the GPU
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
# Initialization: set the image, port, and checkpoint path, then launch a demo
IMAGE_NAME=qwenllm/qwen:cu117
PORT=8901
CHECKPOINT_PATH=$(pwd)/Qwen-7B-Chat # local path of the downloaded model + code
bash docker/docker_openai_api.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH} --port ${PORT}
bash docker/docker_web_demo.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH} --port ${PORT}
bash docker/docker_cli_demo.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH}
# Open http://localhost:${PORT} to use the demo.
离线使用ModelScope的模型
# Load the model offline
# Download every file listed under the repo's /files page
git clone https://www.modelscope.cn/qwen/Qwen2-0.5b.git
# NOTE(review): repo is cased "Qwen2-0.5b" here but the Python below loads
# "qwen/Qwen2-0.5B" — confirm both resolve to the same model path.
# Use the modelscope py38 Docker image and upgrade transformers,
# otherwise loading fails with: KeyError: 'qwen2'
pip install transformers -U
#!/opt/conda/bin/python3
# Offline chat with Qwen2 through plain generate(); reference:
# https://qwen.readthedocs.io/en/latest/inference/chat.html
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"  # cuda/cpu

# Qwen2 no longer needs trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen2-0.5B",
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen2-0.5B")

# There is no model.chat(); format the conversation with the chat template
# and feed it to model.generate() instead.
prompt = "你是哪位?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
chat_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([chat_text], return_tensors="pt").to(device)

# `max_new_tokens` caps the length of the generated reply.
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)

# Drop the prompt tokens, keeping only the newly generated ids.
trimmed_ids = [
    full[len(inp):]
    for inp, full in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(trimmed_ids, skip_special_tokens=True)[0]
print(response)
modelscope 镜像环境使用
# Image list: https://github.com/modelscope/modelscope/blob/master/README_zh.md#%E9%95%9C%E5%83%8F
# CPU image
# py38
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5
# GPU image
# py38
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.8.0-py38-torch2.0.1-tf2.13.0-1.9.5
# Give the CPU image a short local alias
docker tag registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5 modelscope
# Test interactively; run.py is the Python script shown above
docker run --rm -it -v $(pwd):$(pwd) --name=sk -w $(pwd) modelscope bash
pip install transformers -U
# Or run it non-interactively in one shot
docker run --rm -v $(pwd):$(pwd) --name=sk -w $(pwd) modelscope ./run.py