搭建通义千问模型
使用ollama部署(推荐)
# N卡支持
# 查看当前状态, 如果configuration的驱动是nouveau需要关闭: 教程 https://dl.2011101.xyz/%E6%95%99%E7%A8%8B/How%20to%20install%20Nvidia%20driver%20on%20CentOS%207%20Linux%20-%20nixCraft%20%282024_7_16%2016_26_23%29.html
lshw -C display
# driver=nouveau
# 下载并安装官方驱动: https://www.nvidia.com/download/index.aspx?lang=en-us , 安装成功后宿主机即可使用nvidia-smi命令
# 解决安装驱动提示没有kernel-source的问题
yum install kernel-devel kernel-headers
yum install "kernel-devel-uname-r == $(uname -r)"
# 是否安装成功
lshw -C display
# configuration: driver=nvidia latency=0
# 安装container运行时工具(需先按NVIDIA官方文档配置软件源; 下面是Debian/Ubuntu的命令, CentOS/RHEL请改用 yum install -y nvidia-container-toolkit):
apt-get install -y nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
cat /etc/docker/daemon.json
# {
# "runtimes": {
# "nvidia": {
# "args": [],
# "path": "nvidia-container-runtime"
# }
# }
# }
systemctl restart docker
# 测试显卡
docker run --privileged --rm -it --gpus=all --runtime=nvidia registry.cn-hangzhou.aliyuncs.com/jcleng/tiborvass-nvidia-smi:latest nvidia-smi
# ollama/ollama
docker run -it --rm --gpus=all --runtime=nvidia -v $(pwd)/ollama:/root/.ollama -p 11434:11434 --name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama
docker run -d --gpus=all --runtime=nvidia -v $(pwd)/ollama:/root/.ollama -p 11434:11434 --name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama
# AMD GPU (ROCm image).
# HSA_OVERRIDE_GFX_VERSION takes a version in x.y.z form, not a gfx name:
# gfx1031 cards are not an official ROCm target, so they are mapped to the
# closest supported one, gfx1030, which is written as 10.3.0.
docker run --rm \
  --privileged \
  -e HSA_OVERRIDE_GFX_VERSION=10.3.0 \
  --device=/dev/kfd \
  --device=/dev/dri \
  -v "$(pwd)/ollama:/root/.ollama" -p 11434:11434 \
  --name ollama_amd registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama:rocm
# 无显卡也支持直接运行
docker run -d \
-v $(pwd)/ollama:/root/.ollama \
-p 11434:11434 \
--name ollama registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-ollama
# 模型仓库: https://ollama.com/library
# 千问: https://ollama.ai/library/qwen/tags
# 下载比较快
docker exec -it ollama ollama run qwen:0.5b
# ollama webui
# ghcr.io/ollama-webui/ollama-webui:main
docker run -d -p 2081:8080 --add-host=host.docker.internal:host-gateway --name ollama-webui --restart always registry.cn-hangzhou.aliyuncs.com/jcleng/ollama-webui-ollama-webui:main
# api 使用
https://github.com/ollama/ollama/blob/main/docs/api.md
# 调整ollama默认端口和监听地址
https://github.com/ollama/ollama/issues/2194#issuecomment-1911149296
OLLAMA_HOST=0.0.0.0:11434
webapi
###
curl http://192.168.195.50:11434/api/generate -d '{
"model": "qwen2:7b",
"stream": false,
"prompt": "你是?"
}'

官方部署(不推荐)
# 准备模型文件 https://modelscope.cn/models/qwen/Qwen-7B-Chat/files
git lfs install
git clone https://www.modelscope.cn/qwen/Qwen-7B-Chat.git
# 配置nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
# 初始化 (下面的 docker/*.sh 脚本来自 https://github.com/QwenLM/Qwen 仓库, 需先clone该仓库并在其根目录下执行)
IMAGE_NAME=qwenllm/qwen:cu117
PORT=8901
CHECKPOINT_PATH=$(pwd)/Qwen-7B-Chat # 下载到本地的模型及代码路径
bash docker/docker_openai_api.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH} --port ${PORT}
bash docker/docker_web_demo.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH} --port ${PORT}
bash docker/docker_cli_demo.sh -i ${IMAGE_NAME} -c ${CHECKPOINT_PATH}
# 你可以打开http://localhost:${PORT} 来使用该Demo。
离线使用ModelScope的模型
# 加载离线模型
# 下载/files所有文件
git clone https://www.modelscope.cn/qwen/Qwen2-0.5b.git
# 使用modelscope的docker镜像py38,且更新transformers; 否则KeyError: 'qwen2'
pip install transformers -U
#!/opt/conda/bin/python3
# Minimal chat-style inference with Qwen2-0.5B through Hugging Face transformers.
# https://qwen.readthedocs.io/en/latest/inference/chat.html
from transformers import AutoModelForCausalLM, AutoTokenizer

# Now you do not need to add "trust_remote_code=True"
# device_map="auto" lets transformers decide where the weights are placed, so
# the inputs below follow model.device instead of a hard-coded device string.
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen2-0.5B",
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen2-0.5B")

# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
prompt = "你是哪位?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
# tokenize=False returns the formatted prompt as a string; it is tokenized below.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# Put the input tensors on the same device as the model to avoid a
# CPU/GPU mismatch when the model was loaded onto a GPU.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Directly use generate() and tokenizer.decode() to get the output.
# Use `max_new_tokens` to control the maximum output length.
# Unpacking model_inputs passes attention_mask together with input_ids.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
# Drop the prompt tokens so that only the newly generated part is decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
modelscope 镜像环境使用
# https://github.com/modelscope/modelscope/blob/master/README_zh.md#%E9%95%9C%E5%83%8F
# CPU
# py38
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5
# GPU
# py38
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.8.0-py38-torch2.0.1-tf2.13.0-1.9.5
docker tag registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5 modelscope
# 测试, run.py就是上面的代码
docker run --rm -it -v $(pwd):$(pwd) --name=sk -w $(pwd) modelscope bash
pip install transformers -U
# 直接运行
docker run --rm -v $(pwd):$(pwd) --name=sk -w $(pwd) modelscope ./run.py
windows-amd使用ollama-for-amd
https://github.com/likelovewant/ollama-for-amd/issues/1
# 运行之后查看运行日志, 提示:
HSA_OVERRIDE_GFX_VERSION usage
# 设置变量环境之后重新运行
HSA_OVERRIDE_GFX_VERSION=gfx803
# 日志出现如下提示, 说明已经使用GPU了
time=2024-07-22T09:51:28.812+08:00 level=INFO source=types.go:105 msg="inference compute" id=1 library=rocm compute=gfx803 driver=5.4 name="AMD Radeon RX 580 2048SP" total="8.0 GiB" available="7.9 GiB"
# 无法运行, 可以尝试:
# No such file or directory for GPU arch : gfx803
https://github.com/likelovewant/ROCmLibs-for-gfx1103-AMD780M-APU/releases/tag/v0.5.7
# 测试 RX 580 2048SP 使用likelovewant的rocblas才能运行, 且偶尔会失败
ollama run qwen2:0.5b
ollama run qwen2:7b
模型转.gguf格式给ollama使用
测试发现自行转换的Qwen2-0.5B存在幻觉回答, 效果比不上官方提供的.gguf https://github.com/QwenLM/Qwen2/issues/57#issuecomment-1963370813
# 模型仓库
https://huggingface.co/models?library=gguf&language=zh&sort=downloads
https://www.modelscope.cn/models
# 下载模型
git lfs install
git clone https://www.modelscope.cn/qwen/Qwen2-0.5B.git
# 使用 llama.cpp 转化, clone模型, 里面是.safetensors的文件,tokenizer_config.json,tokenizer.model
# https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#hot-topics
# 或者: https://ollama.fan/getting-started/import/#importing-pytorch-safetensors
git clone --depth=1 https://github.com/ggerganov/llama.cpp
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple sentencepiece numpy torch pyyaml safetensors transformers
cd llama.cpp
./convert_hf_to_gguf.py --help
# outtype使用高精度
./convert_hf_to_gguf.py --outfile Qwen2-0.5B.gguf --outtype f32 ../Qwen2-0.5B
# INFO:hf-to-gguf:Model successfully exported to Qwen2-0.5B.gguf
# Quantize the model: https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/README.md
# Create the build directory (-p so a re-run does not fail) and build llama.cpp
# with one job per available CPU instead of a hard-coded job count.
mkdir -p build && cd build
cmake .. && make -j"$(nproc)"
# Run the quantize tool with the input model, the output model and the quantization type.
# The guide below recommends Q4_0; Q4_K_M is what is actually used here:
# https://ollama.qianniu.city/doc/Ollama%20%E5%AF%BC%E5%85%A5%E6%A8%A1%E5%9E%8B.html
./bin/llama-quantize ../Qwen2-0.5B.gguf ../Qwen2-0.5B-Q4_K_M.gguf Q4_K_M
# The working directory is build/, while both .gguf files are one level up.
ls -lash ../Qwen2-0.5B.gguf
# 1.3G -rw-r--r-- 1 root root 949M Jul 22 08:18 Qwen2-0.5B.gguf
ls -lash ../Qwen2-0.5B-Q4_K_M.gguf
# 380M -rw-r--r-- 1 root root 380M Jul 22 08:32 Qwen2-0.5B-Q4_K_M.gguf
# 可以直接运行api模型服务
./bin/llama-server -m ./qwen2-7b-instruct-q4_0.gguf -ngl 28 -fa --host 0.0.0.0
# 命令行运行推理
./bin/llama-simple -m ../Qwen2-0.5B-Q4_K_M.gguf --prompt "你好,世界。"
./bin/llama-cli -m ./qwen2-7b-instruct-q4_0.gguf -p "你是"
./bin/llama-cli -m ./Qwen2-0.5B-Q4_K_M.gguf -p "你是个乐于助人的助手" -cnv
# ollama导入gguf模型
https://ollama.fan/getting-started/#import-from-gguf
touch Modelfile
# 内容
FROM ./Qwen2-0.5B-Q4_K_M.gguf
# set the temperature to 0.7 [higher is more creative, lower is more coherent]
PARAMETER temperature 0.7
PARAMETER top_p 0.8
PARAMETER repeat_penalty 1.05
PARAMETER top_k 20
TEMPLATE """{{ if and .First .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}<|im_start|>user
{{ .Prompt }}<|im_end|>
<|im_start|>assistant
{{ .Response }}"""
# set the system message
SYSTEM """
You are a helpful assistant.
"""
# 从Modelfile创建模型
ollama create my_qall -f Modelfile
# 使用
ollama run my_qall "你是谁?"
ollama run my_qall
ollama show my_qall
ollama show --modelfile my_qall
# 删除
ollama list
ollama rm my_qall:latest
ollama客户端
# ChatBox 桌面+移动端
https://github.com/Bin-Huang/ChatBox
# vscode扩展(使用失败)
saoudrizwan.claude-dev