I'm working with the langchain library and transformers to build a language model application. I want to integrate a CallbackManagerForLLMRun so I can stream responses in my RetrievalQA chain. Below is the code I have so far, including my custom LLAMA class, which loads the /home/llama/LLM/llama/CACHE-Llama-3-8B-chat-merged model with transformers.

from abc import ABC
from typing import Any, List, Mapping, Optional

from transformers import AutoModelForCausalLM, AutoTokenizer

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.base import LLM
from langchain.prompts import PromptTemplate
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "/home/llama/LLM/llama/CACHE-Llama-3-8B-chat-merged",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("/home/llama/LLM/llama/CACHE-Llama-3-8B-chat-merged")

class LLAMA(LLM, ABC):
    max_token: int = 10000
    temperature: float = 0.01
    top_p: float = 0.9
    history_len: int = 3

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "LLAMA3"

    @property
    def _history_len(self) -> int:
        return self.history_len

    def set_history_len(self, history_len: int = 10) -> None:
        self.history_len = history_len

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        # run_manager is accepted but never used yet; this is where I want to
        # hook the streaming in.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=512
        )
        # Strip the prompt tokens so only the newly generated tokens remain.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "max_token": self.max_token,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "history_len": self.history_len,
        }

llm = LLAMA()

# embedding_model_dict, EMBEDDING_MODEL, EMBEDDING_DEVICE, load_file, filepath,
# FAISSWrapper, PROMPT_TEMPLATE, CHAIN_TYPE and VECTOR_SEARCH_TOP_K are defined
# elsewhere in my project.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_dict[EMBEDDING_MODEL],
    model_kwargs={"device": EMBEDDING_DEVICE},
)

docs = load_file(filepath)

docsearch = FAISSWrapper.from_documents(docs, embeddings)

prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context_str", "question"]
)

chain_type_kwargs = {"prompt": prompt, "document_variable_name": "context_str"}
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=CHAIN_TYPE,
    retriever=docsearch.as_retriever(search_kwargs={"k": VECTOR_SEARCH_TOP_K}),
    chain_type_kwargs=chain_type_kwargs,
)

query = "Give me a short introduction to large language model."
print(qa.run(query))
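For the streaming itself, the direction I've been experimenting with is to replace _call with something along these lines, after adding from threading import Thread and from transformers import TextIteratorStreamer at the top of the file. This is only my sketch of how I think run_manager.on_llm_new_token is supposed to be used (the background thread is my own guess, and stop is still ignored), so I'm not sure it's the right approach:

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        # Run generate() in a background thread; the streamer yields decoded
        # text chunks as soon as they are produced.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=512)
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        response = ""
        for chunk in streamer:
            response += chunk
            if run_manager is not None:
                # Forward each chunk to the callbacks the chain passed in.
                run_manager.on_llm_new_token(chunk)
        thread.join()
        return response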

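And this is roughly how I was planning to attach a streaming handler when calling the chain; StreamingStdOutCallbackHandler is just the stock stdout handler from langchain, used here as a placeholder for whatever handler I end up needing:

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# My understanding is that callbacks passed at call time end up as run_manager
# inside LLAMA._call, but I'm not certain that's how it works.
print(qa.run(query, callbacks=[StreamingStdOutCallbackHandler()]))

How should I wire run_manager into _call so that the RetrievalQA chain actually streams tokens as they are generated? Is the TextIteratorStreamer approach above the right way to do it, or is there a cleaner pattern?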
