Setting up the Qwen (通义千问) model

  • Deploy with Ollama (recommended)

# NVIDIA GPU support
apt-get install -y nvidia-container-toolkit
# ollama/ollama
docker run -d --gpus=all -v $(pwd)/ollama:/root/.ollama -p 11434:11434 --name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama

# Can also run directly without a GPU
docker run -d -v $(pwd)/ollama:/root/.ollama -p 11434:11434 --name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama


# Model library: https://ollama.com/library
# Qwen: https://ollama.ai/library/qwen/tags
# The 0.5b model is small, so it downloads quickly
docker exec -it ollama ollama run qwen:0.5b

# ollama webui
# ghcr.io/ollama-webui/ollama-webui:main
docker run -d -p 8080:8080 --add-host=host.docker.internal:host-gateway --name ollama-webui --restart always registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-webui-ollama-webui:main

# API usage
https://github.com/ollama/ollama/blob/main/docs/api.md
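
A minimal sketch of calling the Ollama generate API from Python (assumes the container above is listening on localhost:11434 and qwen:0.5b has already been pulled; requests is an extra dependency):

import requests

# Non-streaming generation request against the local Ollama container
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "qwen:0.5b", "prompt": "你是哪位?", "stream": False},
)
print(resp.json()["response"])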

(screenshot: ../_images/1719386482008.jpg)

  • Official deployment (not recommended)

# Prepare the model files: https://modelscope.cn/models/qwen/Qwen-7B-Chat/files
git lfs install
git clone https://www.modelscope.cn/qwen/Qwen-7B-Chat.git

# Configure nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi

# Initialize
IMAGE_NAME=qwenllm/qwen:cu117
PORT=8901
CHECKPOINT_PATH=$(pwd)/Qwen-7B-Chat   # path to the locally downloaded model and code
bash docker/docker_openai_api.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH} --port ${PORT}
bash docker/docker_web_demo.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH} --port ${PORT}
bash docker/docker_cli_demo.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH}

# You can open http://localhost:${PORT} to use the web demo.
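
A minimal sketch of calling the OpenAI-compatible API started by docker_openai_api.sh (assumes it listens on localhost:8901 as set above; the model name "Qwen" is an assumption and may need adjusting to whatever the server actually serves):

import requests

# Chat completion request against the local OpenAI-compatible endpoint
# (the model name here is an assumption, not confirmed by the upstream docs)
resp = requests.post(
    "http://localhost:8901/v1/chat/completions",
    json={
        "model": "Qwen",
        "messages": [{"role": "user", "content": "你好"}],
    },
)
print(resp.json()["choices"][0]["message"]["content"])
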
  • Using ModelScope models offline

# Load an offline model
# Download all files under /files
git clone https://www.modelscope.cn/qwen/Qwen2-0.5B.git

# Use the ModelScope py38 docker image and upgrade transformers, otherwise: KeyError: 'qwen2'
pip install transformers -U

#!/opt/conda/bin/python3
# https://qwen.readthedocs.io/en/latest/inference/chat.html

from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cpu" # cuda/cpu

# Now you do not need to add "trust_remote_code=True"
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen2-0.5B",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen2-0.5B")

# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
prompt = "你是哪位?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Directly use generate() and tokenizer.decode() to get the output.
# Use `max_new_tokens` to control the maximum output length.
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
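
To stay fully offline, you can also point from_pretrained at the local git clone instead of the hub id (a sketch; the path below assumes the clone from the step above sits in the working directory):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical offline variant: load from the local clone rather than the hub id
local_path = "./Qwen2-0.5B"  # path of the git clone above (assumption)
model = AutoModelForCausalLM.from_pretrained(local_path, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(local_path)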

  • Using the ModelScope image environment

# https://github.com/modelscope/modelscope/blob/master/README_zh.md#%E9%95%9C%E5%83%8F

# CPU
# py38
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5

# GPU
# py38
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.8.0-py38-torch2.0.1-tf2.13.0-1.9.5


docker tag registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5 modelscope
# Test: run.py is the script shown above
docker run --rm -it -v $(pwd):$(pwd) --name=sk -w $(pwd) modelscope bash
pip install transformers -U
# Run it directly (run.py must be executable: chmod +x run.py)
docker run --rm -v $(pwd):$(pwd) --name=sk -w $(pwd) modelscope ./run.py