# Reset the Command Line Tools selection and (re)install them so the
# Apple clang toolchain is available for the build
sudo xcode-select --reset
xcode-select --install
# Fetch llama.cpp and build it, targeting Apple Silicon explicitly
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DCMAKE_APPLE_SILICON_PROCESSOR=arm64
make -j
# Return to the repo root so the binary path below resolves, then start an
# interactive instruct session: -t sets CPU threads, -b the prompt batch size,
# -n -1 generates until EOS, --temp 0 is greedy (deterministic) sampling,
# -ngl 1 offloads to Metal, and -ins enables instruction/interactive mode
cd ..
./build/bin/main --color --model "/Volumes/GregSSD/LLM/mistral7b/mistral-7b-instruct-v0.1.Q6_K.gguf" -t 7 -b 24 -n -1 --temp 0 -ngl 1 -ins
llama_print_timings:        load time =   30830.61 ms
llama_print_timings:      sample time =     705.00 ms /   474 runs   (    1.49 ms per token,   672.34 tokens per second)
llama_print_timings: prompt eval time =   11926.32 ms /    31 tokens (  384.72 ms per token,     2.60 tokens per second)
llama_print_timings:        eval time =   25413.28 ms /   475 runs   (   53.50 ms per token,    18.69 tokens per second)
llama_print_timings:       total time =  190365.77 ms
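The headline figure is the eval rate. As a quick sanity check on the arithmetic (a throwaway sketch, not part of the workflow), the per-token latency and throughput follow directly from the logged totals:

# Recompute the eval throughput from the raw log values above
eval_ms, eval_runs = 25413.28, 475
ms_per_token = eval_ms / eval_runs          # ~53.50 ms per token
tokens_per_second = 1000.0 / ms_per_token   # ~18.69 tokens per second
print(f"{ms_per_token:.2f} ms/token, {tokens_per_second:.2f} tokens/s")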
# Rebuild llama-cpp-python from source with Metal (GPU) support enabled
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir
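A quick import confirms the rebuilt package installed cleanly (a minimal sanity check; the package exposes its version string as __version__):

import llama_cpp
print(llama_cpp.__version__)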
from llama_cpp import Llama

model = "/Volumes/GregSSD/LLM/mistral7b/mistral-7b-instruct-v0.1.Q6_K.gguf"  # instruction-tuned model
# n_gpu_layers > 0 offloads layers to Metal; a fixed seed keeps runs reproducible
llm = Llama(model_path=model, n_ctx=8192, n_batch=512, n_threads=7, n_gpu_layers=2, verbose=True, seed=42)
system = """
Follow the instructions below to complete the task.
"""
user = """
Create a PHP script to scan a directory and print the contents of the directory.
"""
# Mistral instruct format: system and user text both go inside [INST] ... [/INST];
# llama-cpp-python prepends the BOS token itself, so no literal <s> is needed
message = f"[INST] {system} {user} [/INST]"
# echo=False returns only the completion, avoiding the fragile string surgery
# of stripping the echoed prompt back out of the output
output = llm(message, echo=False, stream=False, max_tokens=4096)
print(output['usage'])
print(output['choices'][0]['text'])
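As an alternative to hand-rolling the [INST] markup, recent llama-cpp-python releases can apply the model's chat template themselves via create_chat_completion. A minimal sketch, assuming a build new enough to ship the Mistral template:

# Roles map onto the model's chat template; no manual [INST] tags needed
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "Follow the instructions below to complete the task."},
        {"role": "user", "content": "Create a PHP script to scan a directory and print the contents of the directory."},
    ],
    max_tokens=4096,
    temperature=0,
)
print(response['choices'][0]['message']['content'])

Passing stream=True to either call instead yields an iterator of partial chunks, which reads much better at roughly 19 tokens per second than waiting for the full completion.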