From 90d9aac96cf2c243c1f9f1ad8ac6d5d3f7ef011a Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Thu, 15 Jan 2026 10:35:51 -0600
Subject: [PATCH 1/5] fix(rlsapi): add RateLimitError and APIStatusError handling

Add exception handlers for RateLimitError (429) and APIStatusError (500)
to match the error handling patterns used by other endpoints like
query.py and streaming_query.py. Previously, these exceptions would
result in unhandled errors instead of proper HTTP error responses.

Signed-off-by: Major Hayden
---
 src/app/endpoints/rlsapi_v1.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
index 72fdf3827..f2b700aa1 100644
--- a/src/app/endpoints/rlsapi_v1.py
+++ b/src/app/endpoints/rlsapi_v1.py
@@ -9,7 +9,7 @@
 
 from fastapi import APIRouter, Depends, HTTPException
 from llama_stack.apis.agents.openai_responses import OpenAIResponseObject
-from llama_stack_client import APIConnectionError
+from llama_stack_client import APIConnectionError, APIStatusError, RateLimitError
 
 import constants
 from authentication import get_auth_dependency
@@ -20,6 +20,8 @@
 from models.config import Action
 from models.responses import (
     ForbiddenResponse,
+    InternalServerErrorResponse,
+    QuotaExceededResponse,
     ServiceUnavailableResponse,
     UnauthorizedResponse,
     UnprocessableEntityResponse,
@@ -40,6 +42,8 @@
     ),
     403: ForbiddenResponse.openapi_response(examples=["endpoint"]),
     422: UnprocessableEntityResponse.openapi_response(),
+    429: QuotaExceededResponse.openapi_response(),
+    500: InternalServerErrorResponse.openapi_response(examples=["generic"]),
     503: ServiceUnavailableResponse.openapi_response(),
 }
 
@@ -161,6 +165,16 @@ async def infer_endpoint(
             cause=str(e),
         )
         raise HTTPException(**response.model_dump()) from e
+    except RateLimitError as e:
+        logger.error("Rate limit exceeded for request %s: %s", request_id, e)
+        response = QuotaExceededResponse(
+            response="The quota has been exceeded", cause=str(e)
+        )
+        raise HTTPException(**response.model_dump()) from e
+    except APIStatusError as e:
+        logger.exception("API error for request %s: %s", request_id, e)
+        response = InternalServerErrorResponse.generic()
+        raise HTTPException(**response.model_dump()) from e
 
     if not response_text:
         logger.warning("Empty response from LLM for request %s", request_id)

From 83f7c4caa2b4508d464a354d5fb52b6cde082d19 Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Thu, 15 Jan 2026 10:36:34 -0600
Subject: [PATCH 2/5] fix(rlsapi): add metrics increment on LLM call failures

Track LLM call failures in Prometheus metrics by incrementing
llm_calls_failures_total counter in each exception handler, matching the
pattern used by other endpoints.
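
For reference, a minimal sketch of the counter pattern this change relies on
(the real counter lives in the project's metrics module; prometheus_client and
the exact name/description below are assumptions for illustration only):

    # illustrative only: a Prometheus counter that each handler bumps on failure
    from prometheus_client import Counter

    llm_calls_failures_total = Counter(
        "llm_calls_failures_total",
        "Total number of LLM calls that raised an error",
    )

    # inside an exception handler:
    llm_calls_failures_total.inc()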
Signed-off-by: Major Hayden
---
 src/app/endpoints/rlsapi_v1.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
index f2b700aa1..582b9733f 100644
--- a/src/app/endpoints/rlsapi_v1.py
+++ b/src/app/endpoints/rlsapi_v1.py
@@ -12,6 +12,7 @@
 from llama_stack_client import APIConnectionError, APIStatusError, RateLimitError
 
 import constants
+import metrics
 from authentication import get_auth_dependency
 from authentication.interface import AuthTuple
 from authorization.middleware import authorize
@@ -157,6 +158,7 @@ async def infer_endpoint(
     try:
         response_text = await retrieve_simple_response(input_source)
     except APIConnectionError as e:
+        metrics.llm_calls_failures_total.inc()
         logger.error(
             "Unable to connect to Llama Stack for request %s: %s", request_id, e
         )
@@ -166,12 +168,14 @@ async def infer_endpoint(
         )
         raise HTTPException(**response.model_dump()) from e
     except RateLimitError as e:
+        metrics.llm_calls_failures_total.inc()
         logger.error("Rate limit exceeded for request %s: %s", request_id, e)
         response = QuotaExceededResponse(
             response="The quota has been exceeded", cause=str(e)
         )
         raise HTTPException(**response.model_dump()) from e
     except APIStatusError as e:
+        metrics.llm_calls_failures_total.inc()
         logger.exception("API error for request %s: %s", request_id, e)
         response = InternalServerErrorResponse.generic()
         raise HTTPException(**response.model_dump()) from e

From 3fc7bd069611ba6285c2eb1d669e54f6b58ed2cc Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Thu, 15 Jan 2026 10:37:14 -0600
Subject: [PATCH 3/5] docs(rlsapi): document skip_rag field as reserved for future use

The skip_rag field in RlsapiV1InferRequest is accepted but not yet
implemented. Update the field description to clarify this for API
consumers.

Signed-off-by: Major Hayden
---
 src/models/rlsapi/requests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/models/rlsapi/requests.py b/src/models/rlsapi/requests.py
index 5bb05b782..fc0f7724a 100644
--- a/src/models/rlsapi/requests.py
+++ b/src/models/rlsapi/requests.py
@@ -126,7 +126,7 @@ class RlsapiV1InferRequest(ConfigurationBase):
     Attributes:
         question: User question string.
         context: Context with system info, terminal output, etc. (defaults provided).
-        skip_rag: Whether to skip RAG retrieval (default False).
+        skip_rag: Reserved for future use. RAG retrieval is not yet implemented.
 
     Example:
         ```python
@@ -152,7 +152,7 @@ class RlsapiV1InferRequest(ConfigurationBase):
     )
     skip_rag: bool = Field(
         default=False,
-        description="Whether to skip RAG retrieval",
+        description="Reserved for future use. RAG retrieval is not yet implemented.",
         examples=[False, True],
     )
 

From 685ee3ac2da3b6dbc6d218840da8392dd3490254 Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Thu, 15 Jan 2026 10:38:41 -0600
Subject: [PATCH 4/5] feat(rlsapi): include system info context in LLM instructions

Enhance the LLM instructions with the user's RHEL system information
(OS, version, architecture) when available. This gives the LLM better
context about the environment the user is asking questions about,
enabling more relevant and accurate responses.
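
For example, with systeminfo of os="RHEL", version="9.3", and arch="x86_64"
(values taken from the unit tests below), the helper appends a single line to
the default system prompt, roughly:

    instructions = _build_instructions(systeminfo)
    # -> "<DEFAULT_SYSTEM_PROMPT>\n\nUser's system: OS: RHEL, Version: 9.3, Architecture: x86_64"

Empty fields are simply omitted, and with no system info at all the default
prompt is returned unchanged.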
Signed-off-by: Major Hayden
---
 src/app/endpoints/rlsapi_v1.py             | 40 +++++++++--
 tests/unit/app/endpoints/test_rlsapi_v1.py | 79 ++++++++++++++++++----
 2 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
index 582b9733f..a7d8a4f11 100644
--- a/src/app/endpoints/rlsapi_v1.py
+++ b/src/app/endpoints/rlsapi_v1.py
@@ -27,7 +27,7 @@
     UnauthorizedResponse,
     UnprocessableEntityResponse,
 )
-from models.rlsapi.requests import RlsapiV1InferRequest
+from models.rlsapi.requests import RlsapiV1InferRequest, RlsapiV1SystemInfo
 from models.rlsapi.responses import RlsapiV1InferData, RlsapiV1InferResponse
 from utils.responses import extract_text_from_response_output_item
 from utils.suid import get_suid
@@ -49,6 +49,35 @@
 }
 
 
+def _build_instructions(systeminfo: RlsapiV1SystemInfo) -> str:
+    """Build LLM instructions incorporating system context when available.
+
+    Enhances the default system prompt with RHEL system information to provide
+    the LLM with relevant context about the user's environment.
+
+    Args:
+        systeminfo: System information from the client (OS, version, arch).
+
+    Returns:
+        Instructions string for the LLM, with system context if available.
+    """
+    base_prompt = constants.DEFAULT_SYSTEM_PROMPT
+
+    context_parts = []
+    if systeminfo.os:
+        context_parts.append(f"OS: {systeminfo.os}")
+    if systeminfo.version:
+        context_parts.append(f"Version: {systeminfo.version}")
+    if systeminfo.arch:
+        context_parts.append(f"Architecture: {systeminfo.arch}")
+
+    if not context_parts:
+        return base_prompt
+
+    system_context = ", ".join(context_parts)
+    return f"{base_prompt}\n\nUser's system: {system_context}"
+
+
 def _get_default_model_id() -> str:
     """Get the default model ID from configuration.
 
@@ -82,7 +111,7 @@ def _get_default_model_id() -> str:
     )
 
 
-async def retrieve_simple_response(question: str) -> str:
+async def retrieve_simple_response(question: str, instructions: str) -> str:
     """Retrieve a simple response from the LLM for a stateless query.
 
     Uses the Responses API for simple stateless inference, consistent with
@@ -90,6 +119,7 @@ async def retrieve_simple_response(question: str) -> str:
 
     Args:
         question: The combined user input (question + context).
+        instructions: System instructions for the LLM.
 
     Returns:
         The LLM-generated response text.
@@ -106,7 +136,7 @@ async def retrieve_simple_response(question: str) -> str:
     response = await client.responses.create(
         input=question,
         model=model_id,
-        instructions=constants.DEFAULT_SYSTEM_PROMPT,
+        instructions=instructions,
         stream=False,
         store=False,
     )
@@ -149,14 +179,14 @@ async def infer_endpoint(
     logger.info("Processing rlsapi v1 /infer request %s", request_id)
 
-    # Combine all input sources (question, stdin, attachments, terminal)
     input_source = infer_request.get_input_source()
+    instructions = _build_instructions(infer_request.context.systeminfo)
     logger.debug(
         "Request %s: Combined input source length: %d", request_id, len(input_source)
     )
 
     try:
-        response_text = await retrieve_simple_response(input_source)
+        response_text = await retrieve_simple_response(input_source, instructions)
     except APIConnectionError as e:
         metrics.llm_calls_failures_total.inc()
         logger.error(
             "Unable to connect to Llama Stack for request %s: %s", request_id, e
diff --git a/tests/unit/app/endpoints/test_rlsapi_v1.py b/tests/unit/app/endpoints/test_rlsapi_v1.py
index 873513985..8ec2cab5a 100644
--- a/tests/unit/app/endpoints/test_rlsapi_v1.py
+++ b/tests/unit/app/endpoints/test_rlsapi_v1.py
@@ -13,6 +13,7 @@
 
 import constants
 from app.endpoints.rlsapi_v1 import (
+    _build_instructions,
     _get_default_model_id,
     infer_endpoint,
     retrieve_simple_response,
@@ -87,6 +88,12 @@ def mock_empty_llm_response_fixture(mocker: MockerFixture) -> None:
     _setup_responses_mock(mocker, mocker.AsyncMock(return_value=mock_response))
 
 
+@pytest.fixture(name="mock_auth_resolvers")
+def mock_auth_resolvers_fixture(mocker: MockerFixture) -> None:
+    """Mock authorization resolvers for endpoint tests."""
+    mock_authorization_resolvers(mocker)
+
+
 @pytest.fixture(name="mock_api_connection_error")
 def mock_api_connection_error_fixture(mocker: MockerFixture) -> None:
     """Mock responses.create() to raise APIConnectionError."""
@@ -96,6 +103,47 @@ def mock_api_connection_error_fixture(mocker: MockerFixture) -> None:
     )
 
 
+# --- Test _build_instructions ---
+
+
+@pytest.mark.parametrize(
+    ("systeminfo_kwargs", "expected_contains", "expected_not_contains"),
+    [
+        pytest.param(
+            {"os": "RHEL", "version": "9.3", "arch": "x86_64"},
+            ["OS: RHEL", "Version: 9.3", "Architecture: x86_64"],
+            [],
+            id="full_systeminfo",
+        ),
+        pytest.param(
+            {"os": "RHEL", "version": "", "arch": ""},
+            ["OS: RHEL"],
+            ["Version:", "Architecture:"],
+            id="partial_systeminfo",
+        ),
+        pytest.param(
+            {},
+            [constants.DEFAULT_SYSTEM_PROMPT],
+            ["OS:", "Version:", "Architecture:"],
+            id="empty_systeminfo",
+        ),
+    ],
+)
+def test_build_instructions(
+    systeminfo_kwargs: dict[str, str],
+    expected_contains: list[str],
+    expected_not_contains: list[str],
+) -> None:
+    """Test _build_instructions with various system info combinations."""
+    systeminfo = RlsapiV1SystemInfo(**systeminfo_kwargs)
+    result = _build_instructions(systeminfo)
+
+    for expected in expected_contains:
+        assert expected in result
+    for not_expected in expected_not_contains:
+        assert not_expected not in result
+
+
 # --- Test _get_default_model_id ---
 
 
@@ -151,7 +199,9 @@ async def test_retrieve_simple_response_success(
     mock_configuration: AppConfig, mock_llm_response: None
 ) -> None:
     """Test retrieve_simple_response returns LLM response text."""
-    response = await retrieve_simple_response("How do I list files?")
+    response = await retrieve_simple_response(
+        "How do I list files?", constants.DEFAULT_SYSTEM_PROMPT
+    )
 
     assert response == "This is a test LLM response."
@@ -160,7 +210,9 @@ async def test_retrieve_simple_response_empty_output(
     mock_configuration: AppConfig, mock_empty_llm_response: None
 ) -> None:
     """Test retrieve_simple_response handles empty LLM output."""
-    response = await retrieve_simple_response("Test question")
+    response = await retrieve_simple_response(
+        "Test question", constants.DEFAULT_SYSTEM_PROMPT
+    )
 
     assert response == ""
@@ -170,7 +222,7 @@ async def test_retrieve_simple_response_api_connection_error(
 ) -> None:
     """Test retrieve_simple_response propagates APIConnectionError."""
     with pytest.raises(APIConnectionError):
-        await retrieve_simple_response("Test question")
+        await retrieve_simple_response("Test question", constants.DEFAULT_SYSTEM_PROMPT)
 
 
 # --- Test infer_endpoint ---
@@ -178,10 +230,11 @@ async def test_retrieve_simple_response_api_connection_error(
 
 @pytest.mark.asyncio
 async def test_infer_minimal_request(
-    mocker: MockerFixture, mock_configuration: AppConfig, mock_llm_response: None
+    mock_configuration: AppConfig,
+    mock_llm_response: None,
+    mock_auth_resolvers: None,
 ) -> None:
     """Test /infer endpoint returns valid response with LLM text."""
-    mock_authorization_resolvers(mocker)
     request = RlsapiV1InferRequest(question="How do I list files?")
 
     response = await infer_endpoint(infer_request=request, auth=MOCK_AUTH)
@@ -194,10 +247,11 @@ async def test_infer_minimal_request(
 
 @pytest.mark.asyncio
 async def test_infer_full_context_request(
-    mocker: MockerFixture, mock_configuration: AppConfig, mock_llm_response: None
+    mock_configuration: AppConfig,
+    mock_llm_response: None,
+    mock_auth_resolvers: None,
 ) -> None:
     """Test /infer endpoint handles full context (stdin, attachments, terminal)."""
-    mock_authorization_resolvers(mocker)
     request = RlsapiV1InferRequest(
         question="Why did this command fail?",
         context=RlsapiV1Context(
@@ -217,10 +271,11 @@ async def test_infer_full_context_request(
 
 @pytest.mark.asyncio
 async def test_infer_generates_unique_request_ids(
-    mocker: MockerFixture, mock_configuration: AppConfig, mock_llm_response: None
+    mock_configuration: AppConfig,
+    mock_llm_response: None,
+    mock_auth_resolvers: None,
 ) -> None:
     """Test that each /infer call generates a unique request_id."""
-    mock_authorization_resolvers(mocker)
     request = RlsapiV1InferRequest(question="How do I list files?")
 
     response1 = await infer_endpoint(infer_request=request, auth=MOCK_AUTH)
@@ -231,12 +286,11 @@ async def test_infer_generates_unique_request_ids(
 
 @pytest.mark.asyncio
 async def test_infer_api_connection_error_returns_503(
-    mocker: MockerFixture,
     mock_configuration: AppConfig,
     mock_api_connection_error: None,
+    mock_auth_resolvers: None,
 ) -> None:
     """Test /infer endpoint returns 503 when LLM service is unavailable."""
-    mock_authorization_resolvers(mocker)
     request = RlsapiV1InferRequest(question="Test question")
 
     with pytest.raises(HTTPException) as exc_info:
@@ -247,12 +301,11 @@ async def test_infer_api_connection_error_returns_503(
 
 @pytest.mark.asyncio
 async def test_infer_empty_llm_response_returns_fallback(
-    mocker: MockerFixture,
     mock_configuration: AppConfig,
     mock_empty_llm_response: None,
+    mock_auth_resolvers: None,
 ) -> None:
     """Test /infer endpoint returns fallback text when LLM returns empty response."""
-    mock_authorization_resolvers(mocker)
     request = RlsapiV1InferRequest(question="Test question")
 
     response = await infer_endpoint(infer_request=request, auth=MOCK_AUTH)

From 48a7c8fda5464fa6fca53c368a821430ee549db1 Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Thu, 15 Jan 2026 12:30:21 -0600
Subject:
 [PATCH 5/5] test(rlsapi): standardize MOCK_AUTH constant in rlsapi tests

Use project-standard mock auth tuple pattern with non-admin flag.

Signed-off-by: Major Hayden
---
 tests/unit/app/endpoints/test_rlsapi_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/app/endpoints/test_rlsapi_v1.py b/tests/unit/app/endpoints/test_rlsapi_v1.py
index 8ec2cab5a..1bfae9225 100644
--- a/tests/unit/app/endpoints/test_rlsapi_v1.py
+++ b/tests/unit/app/endpoints/test_rlsapi_v1.py
@@ -31,7 +31,7 @@
 from tests.unit.utils.auth_helpers import mock_authorization_resolvers
 from utils.suid import check_suid
 
-MOCK_AUTH: AuthTuple = ("test_user_id", "test_user", True, "test_token")
+MOCK_AUTH: AuthTuple = ("mock_user_id", "mock_username", False, "mock_token")
 
 
 def _setup_responses_mock(mocker: MockerFixture, create_behavior: Any) -> None:
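
Read positionally, the new tuple maps naturally to (user_id, username, is_admin,
token); the authoritative field order is whatever AuthTuple defines in
authentication.interface, so the annotated sketch below is an assumption for
illustration only:

    # assumed field meanings, inferred from the mock values and the commit message
    MOCK_AUTH: AuthTuple = (
        "mock_user_id",   # user identifier
        "mock_username",  # user name
        False,            # non-admin flag
        "mock_token",     # auth token
    )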