Copyright (c) 2026 MindMesh Academy. All rights reserved. This content is proprietary and may not be reproduced or distributed without permission.

3.3.2. Retrieval and Grounding

The retrieval step finds relevant documents; grounding ensures the model uses that context rather than hallucinating.

RAG Pattern:
# 1. Embed the user's query so it can be compared against indexed vectors.
embedding = client.embeddings.create(model="text-embedding-ada-002", input=query).data[0].embedding

# 2. Hybrid retrieval: keyword search plus vector similarity on the same index.
vector_query = VectorizedQuery(vector=embedding, fields="contentVector")
results = search_client.search(search_text=query, vector_queries=[vector_query])

# 3. Concatenate retrieved documents into grounding context for the prompt.
context = "\n".join(doc["content"] for doc in results)
messages = [
    {"role": "system", "content": f"Answer based on this context:\n{context}"},
    {"role": "user", "content": query},
]

# 4. Ask the model for an answer grounded in the retrieved context.
response = client.chat.completions.create(model="gpt-4o", messages=messages)
Azure OpenAI "On Your Data" (Built-in RAG):
# "On Your Data": the Azure OpenAI service performs retrieval against the
# configured search index itself, so no manual embed/search step is required.
search_data_source = {
    "type": "azure_search",
    "parameters": {
        "endpoint": search_endpoint,
        "index_name": index_name,
        "authentication": {"type": "api_key", "key": search_key},
    },
}
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "What are our vacation policies?"}],
    extra_body={"data_sources": [search_data_source]},
)
Error Handling Pattern:
import logging

from azure.core.exceptions import HttpResponseError
from openai import AzureOpenAI, APIError

def rag_query_with_fallback(query: str) -> str:
    """Answer *query* via RAG, degrading gracefully when any stage fails.

    Pipeline: embed the query -> retrieve context from the search index ->
    generate a grounded chat completion.  Each stage has its own fallback:

    * Embedding fails  -> fall back to keyword-only search.
    * Search fails     -> generate without retrieved context.
    * Generation fails -> return a static apology message.

    Args:
        query: The user's natural-language question.

    Returns:
        The model's answer, or a fixed error string if generation fails.
    """
    try:
        # Step 1: Generate embedding for vector search.
        embedding = client.embeddings.create(
            model="text-embedding-ada-002",
            input=query
        ).data[0].embedding
    except APIError as e:
        logging.error(f"Embedding generation failed: {e}")
        # Fallback to keyword search only
        embedding = None

    try:
        # Step 2: Search for context (hybrid search when a vector is available).
        if embedding:
            results = search_client.search(
                search_text=query,
                vector_queries=[VectorizedQuery(vector=embedding, fields="contentVector")]
            )
        else:
            # Fallback: keyword search without vector
            results = search_client.search(search_text=query)

        # .get() guards against documents missing the "content" field, which
        # would otherwise raise an uncaught KeyError (HttpResponseError is the
        # only exception handled here).
        context = "\n".join(doc.get("content", "") for doc in results)
    except HttpResponseError as e:
        logging.error(f"Search failed: {e}")
        context = ""  # Proceed without context (acknowledge limitation to user)

    try:
        # Step 3: Generate response; the system prompt only references the
        # retrieved context when retrieval actually produced some.
        messages = [
            {"role": "system", "content": f"Answer based on: {context}" if context else "Answer to the best of your ability."},
            {"role": "user", "content": query}
        ]
        response = client.chat.completions.create(model="gpt-4o", messages=messages)
        return response.choices[0].message.content
    except APIError as e:
        logging.error(f"Generation failed: {e}")
        return "I'm sorry, I couldn't process your request. Please try again."

⚠️ Exam Trap: RAG implementations should handle failures at each stage (embedding, search, generation) independently with appropriate fallbacks.

Alvin Varughese
Written by Alvin Varughese
Founder · 15 professional certifications