I'm working with the langchain and transformers libraries to build a language-model application. I want to integrate a CallbackManagerForLLMRun so that responses stream in my RetrievalQA chain. Below is the code I have so far, including my custom LLAMA class, which loads the /home/llama/LLM/llama/CACHE-Llama-3-8B-chat-merged model through transformers.
from abc import ABC
from typing import Any, List, Mapping, Optional

from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

device = "cuda"  # the device to move input tensors onto (device_map="auto" places the model)

model = AutoModelForCausalLM.from_pretrained(
    "/home/llama/LLM/llama/CACHE-Llama-3-8B-chat-merged",
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("/home/llama/LLM/llama/CACHE-Llama-3-8B-chat-merged")
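# Optional smoke test (my own addition, not part of the chain): confirm the merged
# checkpoint loads and generates before wiring it into langchain. `_probe` is just
# a throwaway name I use here.
_probe = tokenizer(["Hello"], return_tensors="pt").to(device)
print(tokenizer.batch_decode(model.generate(_probe.input_ids, max_new_tokens=8), skip_special_tokens=True)[0])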
class LLAMA(LLM, ABC):
    max_token: int = 10000
    temperature: float = 0.01
    top_p: float = 0.9  # annotated so pydantic treats it as a field, not a class var
    history_len: int = 3

    @property
    def _llm_type(self) -> str:
        return "LLAMA3"

    @property
    def _history_len(self) -> int:
        return self.history_len

    def set_history_len(self, history_len: int = 10) -> None:
        self.history_len = history_len

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        # stop and run_manager are accepted but not used yet; wiring run_manager
        # into generation is exactly what I am trying to figure out.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=512,
        )
        # Strip the prompt tokens so only the newly generated text is decoded.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "max_token": self.max_token,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "history_len": self.history_len,
        }
llm = LLAMA()

# embedding_model_dict, EMBEDDING_MODEL, EMBEDDING_DEVICE, load_file, FAISSWrapper,
# PROMPT_TEMPLATE, CHAIN_TYPE, filepath, and VECTOR_SEARCH_TOP_K come from my
# project config and helper modules (omitted here).
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_dict[EMBEDDING_MODEL],
    model_kwargs={"device": EMBEDDING_DEVICE},
)
docs = load_file(filepath)
docsearch = FAISSWrapper.from_documents(docs, embeddings)

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context_str", "question"]
)
chain_type_kwargs = {"prompt": prompt, "document_variable_name": "context_str"}
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=CHAIN_TYPE,
    retriever=docsearch.as_retriever(search_kwargs={"k": VECTOR_SEARCH_TOP_K}),
    chain_type_kwargs=chain_type_kwargs,
)

query = "Give me a short introduction to large language models."
print(qa.run(query))
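If the streaming _call sketch above is right, I would expect tokens to print incrementally by passing a streaming handler at run time, for example langchain's built-in StreamingStdOutCallbackHandler (this assumes my understanding that Chain.run accepts a callbacks argument is correct):

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Handlers passed here are forwarded to the run_manager inside _call,
# so on_llm_new_token fires once per streamed chunk.
print(qa.run(query, callbacks=[StreamingStdOutCallbackHandler()]))

Is this the right way to connect CallbackManagerForLLMRun to the generation loop, or is there a more idiomatic pattern for streaming inside RetrievalQA?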