diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4e9cd336..466ddb34 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,23 +1,24 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the // README at: https://github.com/devcontainers/templates/tree/main/src/python { - "name": "Python 3", + "name": "Data Formulator Dev", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile - "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye", + "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", // Features to add to the dev container. More info: https://containers.dev/features. - "features": { - "ghcr.io/devcontainers/features/node:1": { - "version": "18" - }, - "ghcr.io/devcontainers/features/azure-cli:1": {} - }, + "features": { + "ghcr.io/devcontainers/features/node:1": { + "version": "18" + }, + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/astral-sh/uv:1": {} + }, // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], + "forwardPorts": [5000, 5173], // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "cd /workspaces/data-formulator && npm install && npm run build && python3 -m venv /workspaces/data-formulator/venv && . /workspaces/data-formulator/venv/bin/activate && pip install -e /workspaces/data-formulator --verbose && data_formulator" + "postCreateCommand": "cd /workspaces/data-formulator && npm install && npm run build && uv sync && uv run data_formulator" // Configure tool-specific properties. // "customizations": {}, diff --git a/.gitignore b/.gitignore index f3420acd..6b8ca6c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *env +.venv/ *api-keys.env **/*.ipynb_checkpoints/ .DS_Store diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..2c073331 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 6467a874..2a16d785 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -2,16 +2,34 @@ How to set up your local machine. ## Prerequisites -* Python > 3.11 +* Python >= 3.11 * Node.js * Yarn +* [uv](https://docs.astral.sh/uv/) (recommended) or pip ## Backend (Python) +### Option 1: With uv (recommended) + +uv is faster and provides reproducible builds via lockfile. + +```bash +uv sync # Creates .venv and installs all dependencies +uv run data_formulator # Run app (opens browser automatically) +uv run data_formulator --dev # Run backend only (for frontend development) +``` + +**Which command to use:** +- **End users / testing the full app**: `uv run data_formulator` - starts server and opens browser to http://localhost:5000 +- **Frontend development**: `uv run data_formulator --dev` - starts backend server only, then run `yarn start` separately for the Vite dev server on http://localhost:5173 + +### Option 2: With pip (fallback) + - **Create a Virtual Environment** ```bash python -m venv venv - .\venv\Scripts\activate + source venv/bin/activate # Unix + # or .\venv\Scripts\activate # Windows ``` - **Install Dependencies** @@ -41,14 +59,16 @@ How to set up your local machine. 
- **Run the app** - - **Windows** - ```bash - .\local_server.bat - ``` - - - **Unix-based** ```bash + # Unix ./local_server.sh + + # Windows + .\local_server.bat + + # Or directly + data_formulator # Opens browser automatically + data_formulator --dev # Backend only (for frontend development) ``` ## Frontend (TypeScript) @@ -61,7 +81,12 @@ How to set up your local machine. - **Development mode** - Run the front-end in development mode using, allowing real-time edits and previews: + First, start the backend server (in a separate terminal): + ```bash + uv run data_formulator --dev # or ./local_server.sh + ``` + + Then, run the frontend in development mode with hot reloading: ```bash yarn start ``` @@ -81,6 +106,10 @@ How to set up your local machine. Then, build python package: ```bash + # With uv + uv build + + # Or with pip pip install build python -m build ``` @@ -116,9 +145,10 @@ When deploying Data Formulator to production, please be aware of the following s 1. **Local DuckDB Files**: When database functionality is enabled (default), Data Formulator stores DuckDB database files locally on the server. These files contain user data and are stored in the system's temporary directory or a configured `LOCAL_DB_DIR`. -2. **Session Management**: - - When database is **enabled**: Session IDs are stored in Flask sessions (cookies) and linked to local DuckDB files - - When database is **disabled**: No persistent storage is used, and no cookies are set. Session IDs are generated per request for API consistency +2. **Identity Management**: + - Each user's data is isolated by a namespaced identity key (e.g., `user:alice@example.com` or `browser:550e8400-...`) + - Anonymous users get a browser-based UUID stored in localStorage + - Authenticated users get their verified user ID from the auth provider 3. **Data Persistence**: User data processed through Data Formulator may be temporarily stored in these local DuckDB files, which could be a security risk in multi-tenant environments. @@ -142,5 +172,90 @@ For production deployment, consider: python -m data_formulator.app --disable-database ``` +## Authentication Architecture + +Data Formulator supports a **hybrid identity system** that supports both anonymous and authenticated users. + +### Identity Flow Overview + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Frontend Request │ +├─────────────────────────────────────────────────────────────────────┤ +│ Headers: │ +│ X-Identity-Id: "browser:550e8400-..." (namespace sent by client) │ +│ Authorization: Bearer (if custom auth implemented) │ +│ (Azure also adds X-MS-CLIENT-PRINCIPAL-ID automatically) │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Backend Identity Resolution │ +│ (auth.py: get_identity_id) │ +├─────────────────────────────────────────────────────────────────────┤ +│ Priority 1: Azure X-MS-CLIENT-PRINCIPAL-ID → "user:" │ +│ Priority 2: JWT Bearer token (if implemented) → "user:" │ +│ Priority 3: X-Identity-Id header → ALWAYS "browser:" │ +│ (client-provided namespace is IGNORED for security) │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Storage Isolation │ +├─────────────────────────────────────────────────────────────────────┤ +│ "user:alice@example.com" → alice's DuckDB file (ONLY via auth) │ +│ "browser:550e8400-..." 
→ anonymous user's DuckDB file │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Security Model + +**Critical Security Rule:** The backend NEVER trusts the namespace prefix from the client-provided `X-Identity-Id` header. Even if a client sends `X-Identity-Id: "user:alice@..."`, the backend strips the prefix and forces `browser:alice@...`. Only verified authentication (Azure headers or JWT) can result in a `user:` prefixed identity. + +The key security principle is **namespaced isolation with forced prefixing**: + +| Scenario | X-Identity-Id Sent | Backend Resolution | Storage Key | +|----------|-------------------|-------------------|-------------| +| Anonymous user | `browser:550e8400-...` | Strips prefix, forces `browser:` | `browser:550e8400-...` | +| Azure logged-in user | `browser:550e8400-...` | Uses Azure header (priority 1) | `user:alice@...` | +| Attacker spoofing | `user:alice@...` (forged) | No valid auth, strips & forces `browser:` | `browser:alice@...` | + +**Why this is secure:** An attacker sending `X-Identity-Id: user:alice@...` gets `browser:alice@...` as their storage key, which is completely separate from the real `user:alice@...` that only authenticated Alice can access. + +### Implementing Custom Authentication + +To add JWT-based authentication: + +1. **Backend** (`tables_routes.py`): Uncomment and configure the JWT verification code in `get_identity_id()` +2. **Frontend** (`utils.tsx`): Implement `getAuthToken()` to retrieve the JWT from your auth context +3. **Add JWT secret** to Flask config: `current_app.config['JWT_SECRET']` + +### Azure App Service Authentication + +When deployed to Azure with EasyAuth enabled: +- Azure automatically adds `X-MS-CLIENT-PRINCIPAL-ID` header to authenticated requests +- The backend reads this header first (highest priority) +- No frontend changes needed - Azure handles the auth flow + +### Frontend Identity Management + +The frontend (`src/app/identity.ts`) manages identity as follows: + +```typescript +// Identity is always initialized with browser ID +identity: { type: 'browser', id: getBrowserId() } + +// If user logs in (e.g., via Azure), it's updated to: +identity: { type: 'user', id: userInfo.userId } + +// All API requests send namespaced identity: +// X-Identity-Id: "browser:550e8400-..." or "user:alice@..." +``` + +This ensures: +1. **Anonymous users**: Work immediately with localStorage-based browser ID +2. **Logged-in users**: Get their verified user ID from the auth provider +3. **Cross-tab consistency**: Browser ID is shared via localStorage across all tabs + ## Usage See the [Usage section on the README.md page](README.md#usage). diff --git a/README.md b/README.md index 5b4f9e84..db7ff79a 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@

Try Online Demo   - Install Locally + Install Locally

@@ -32,6 +32,9 @@ https://github.com/user-attachments/assets/8ca57b68-4d7a-42cb-bcce-43f8b1681ce2 ## News 🔥🔥🔥 +[01-31-2025] **uv support** — Faster installation with uv +- 🚀 **Install with uv**: Data Formulator now supports installation via [uv](https://docs.astral.sh/uv/), the ultra-fast Python package manager. Get started in seconds with `uvx data_formulator` or `uv pip install data_formulator`. + [01-25-2025] **Data Formulator 0.6** — Real-time insights from live data - ⚡ **Connect to live data**: Connect to URLs and databases with automatic refresh intervals. Visualizations update automatically as your data changes to provide you live insights. [Demo: track international space station position speed live](https://github.com/microsoft/data-formulator/releases/tag/0.6) - 🎨 **UI Updates**: Unified UI for data loading; direct drag-and-drop fields from the data table to update visualization designs. @@ -127,9 +130,30 @@ Data Formulator enables analysts to iteratively explore and visualize data. Star Play with Data Formulator with one of the following options: -- **Option 1: Install via Python PIP** +- **Option 1: Install via uv (recommended)** + + [uv](https://docs.astral.sh/uv/) is an extremely fast Python package manager. If you have uv installed, you can run Data Formulator directly without any setup: + + ```bash + # Run data formulator directly (no install needed) + uvx data_formulator + ``` + + Or install it in a project/virtual environment: + + ```bash + # Install data_formulator + uv pip install data_formulator + + # Run data formulator + python -m data_formulator + ``` + + Data Formulator will be automatically opened in the browser at [http://localhost:5000](http://localhost:5000). + +- **Option 2: Install via pip** - Use Python PIP for an easy setup experience, running locally (recommend: install it in a virtual environment). + Use pip for installation (recommend: install it in a virtual environment). ```bash # install data_formulator @@ -143,13 +167,13 @@ Play with Data Formulator with one of the following options: *you can specify the port number (e.g., 8080) by `python -m data_formulator --port 8080` if the default port is occupied.* -- **Option 2: Codespaces (5 minutes)** +- **Option 3: Codespaces (5 minutes)** You can also run Data Formulator in Codespaces; we have everything pre-configured. For more details, see [CODESPACES.md](CODESPACES.md). [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/microsoft/data-formulator?quickstart=1) -- **Option 3: Working in the developer mode** +- **Option 4: Working in the developer mode** You can build Data Formulator locally if you prefer full control over your development environment and develop your own version on top. For detailed instructions, refer to [DEVELOPMENT.md](DEVELOPMENT.md). 
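To make the identity security model described in the DEVELOPMENT.md changes above concrete, here is a minimal sketch (not part of the changes in this diff) that exercises `get_identity_id()` from the new `py-src/data_formulator/auth.py` against the three scenarios in the security-model table; it assumes Data Formulator is installed locally (e.g. via `uv sync`):

```python
# Minimal sketch (illustrative only): shows how get_identity_id() resolves the
# three cases from the DEVELOPMENT.md security-model table. Assumes the package
# is installed locally so data_formulator.app and data_formulator.auth import.
from data_formulator.app import app
from data_formulator.auth import get_identity_id

def resolve(headers: dict[str, str]) -> str:
    # get_identity_id() reads flask.request, so run it inside a test request context
    with app.test_request_context('/', headers=headers):
        return get_identity_id()

# Anonymous user: the client-sent ID is kept, the "browser:" prefix is enforced
print(resolve({'X-Identity-Id': 'browser:550e8400-e29b-41d4-a716-446655440000'}))
# -> browser:550e8400-e29b-41d4-a716-446655440000

# Spoofing attempt: a forged "user:" prefix is stripped and forced back to "browser:"
print(resolve({'X-Identity-Id': 'user:alice@example.com'}))
# -> browser:alice@example.com

# Azure EasyAuth: the Azure-injected header takes priority over anything the client sends
print(resolve({'X-MS-CLIENT-PRINCIPAL-ID': 'alice@example.com',
               'X-Identity-Id': 'browser:550e8400-e29b-41d4-a716-446655440000'}))
# -> user:alice@example.com
```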
diff --git a/local_server.bat b/local_server.bat index b585d712..36026cf9 100644 --- a/local_server.bat +++ b/local_server.bat @@ -7,4 +7,11 @@ :: set https_proxy=http://127.0.0.1:7890 set FLASK_RUN_PORT=5000 -python -m py-src.data_formulator.app --port %FLASK_RUN_PORT% --dev + +:: Use uv if available, otherwise fall back to python +where uv >nul 2>nul +if %ERRORLEVEL% EQU 0 ( + uv run data_formulator --port %FLASK_RUN_PORT% --dev +) else ( + python -m data_formulator.app --port %FLASK_RUN_PORT% --dev +) diff --git a/local_server.sh b/local_server.sh index 0df7db89..fbba1e3b 100644 --- a/local_server.sh +++ b/local_server.sh @@ -5,6 +5,11 @@ # export http_proxy=http://127.0.0.1:7890 # export https_proxy=http://127.0.0.1:7890 -#env FLASK_APP=py-src/data_formulator/app.py FLASK_RUN_PORT=5000 FLASK_RUN_HOST=0.0.0.0 flask run export FLASK_RUN_PORT=5000 -python -m py-src.data_formulator.app --port ${FLASK_RUN_PORT} --dev \ No newline at end of file + +# Use uv if available, otherwise fall back to python +if command -v uv &> /dev/null; then + uv run data_formulator --port ${FLASK_RUN_PORT} --dev +else + python -m data_formulator.app --port ${FLASK_RUN_PORT} --dev +fi \ No newline at end of file diff --git a/public/screenshot-stock-price-live.webp b/public/screenshot-stock-price-live.webp new file mode 100644 index 00000000..b0ebe71b Binary files /dev/null and b/public/screenshot-stock-price-live.webp differ diff --git a/py-src/data_formulator/__init__.py b/py-src/data_formulator/__init__.py index 2f2fd61f..ee0d133d 100644 --- a/py-src/data_formulator/__init__.py +++ b/py-src/data_formulator/__init__.py @@ -3,7 +3,7 @@ def run_app(): """Launch the Data Formulator Flask application.""" - # Import app only when actually running to avoid side effects + # Import app only when actually running to avoid heavy imports at package load from data_formulator.app import run_app as _run_app return _run_app() diff --git a/py-src/data_formulator/agent_routes.py b/py-src/data_formulator/agent_routes.py index 3de374bb..8103bcdd 100644 --- a/py-src/data_formulator/agent_routes.py +++ b/py-src/data_formulator/agent_routes.py @@ -12,7 +12,7 @@ mimetypes.add_type('application/javascript', '.mjs') import flask -from flask import request, session, jsonify, Blueprint, current_app, Response, stream_with_context +from flask import request, jsonify, Blueprint, current_app, Response, stream_with_context import logging import json @@ -28,6 +28,7 @@ from data_formulator.agents.agent_sql_data_rec import SQLDataRecAgent from data_formulator.agents.agent_sort_data import SortDataAgent +from data_formulator.auth import get_identity_id from data_formulator.agents.agent_data_load import DataLoadAgent from data_formulator.agents.agent_data_clean import DataCleanAgent from data_formulator.agents.agent_data_clean_stream import DataCleanAgentStream @@ -180,10 +181,8 @@ def process_data_on_load_request(): logger.info(f" model: {content['model']}") - try: - conn = db_manager.get_connection(session['session_id']) - except Exception as e: - conn = None + identity_id = get_identity_id() + conn = db_manager.get_connection(identity_id) agent = DataLoadAgent(client=client, conn=conn) @@ -395,7 +394,8 @@ def derive_data(): if chart_encodings == {}: mode = "recommendation" - conn = db_manager.get_connection(session['session_id']) if language == "sql" else None + identity_id = get_identity_id() + conn = db_manager.get_connection(identity_id) if language == "sql" else None if mode == "recommendation": # now it's in recommendation mode @@ -465,7 
+465,8 @@ def generate(): "api_version": content['model'].get('api_version', '') } - session_id = session.get('session_id') if language == "sql" else None + # Get identity for SQL mode database connections + identity_id = get_identity_id() if language == "sql" else None exec_python_in_subprocess = current_app.config['CLI_ARGS']['exec_python_in_subprocess'] try: @@ -474,7 +475,7 @@ def generate(): input_tables=input_tables, initial_plan=initial_plan, language=language, - session_id=session_id, + session_id=identity_id, exec_python_in_subprocess=exec_python_in_subprocess, max_iterations=max_iterations, max_repair_attempts=max_repair_attempts, @@ -563,7 +564,8 @@ def refine_data(): logger.info(chart_encodings) logger.info(new_instruction) - conn = db_manager.get_connection(session['session_id']) if language == "sql" else None + identity_id = get_identity_id() + conn = db_manager.get_connection(identity_id) if language == "sql" else None # always resort to the data transform agent agent = SQLDataTransformationAgent(client=client, conn=conn, agent_coding_rules=agent_coding_rules) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'], agent_coding_rules=agent_coding_rules) @@ -626,7 +628,8 @@ def generate(): language = content.get("language", "python") if language == "sql": - db_conn = db_manager.get_connection(session['session_id']) + identity_id = get_identity_id() + db_conn = db_manager.get_connection(identity_id) else: db_conn = None @@ -677,7 +680,8 @@ def generate(): language = content.get("language", "python") if language == "sql": - db_conn = db_manager.get_connection(session['session_id']) + identity_id = get_identity_id() + db_conn = db_manager.get_connection(identity_id) else: db_conn = None diff --git a/py-src/data_formulator/agents/client_utils.py b/py-src/data_formulator/agents/client_utils.py index 4ccce9d0..43cf0ee3 100644 --- a/py-src/data_formulator/agents/client_utils.py +++ b/py-src/data_formulator/agents/client_utils.py @@ -1,27 +1,7 @@ import litellm import openai from azure.identity import DefaultAzureCredential, get_bearer_token_provider -from typing import Dict, Optional, Union -class OpenAIClientAdapter(object): - """ - Wrapper around OpenAI or AzureOpenAI client that provides the same interface as Client. - """ - def __init__(self, openai_client: Union[openai.OpenAI, openai.AzureOpenAI], model: str): - self._openai_client = openai_client - self.model = model - self.params = {} - - def get_completion(self, messages): - """ - Returns a completion using the wrapped OpenAI client. - """ - completion_params = { - "model": self.model, - "messages": messages, - } - - return self._openai_client.chat.completions.create(**completion_params) class Client(object): """ @@ -69,7 +49,7 @@ def __init__(self, endpoint, model, api_key=None, api_base=None, api_version=No self.model = f"ollama/{model}" @classmethod - def from_config(cls, model_config: Dict[str, str]): + def from_config(cls, model_config: dict[str, str]): """ Create a client instance from model configuration. @@ -132,7 +112,7 @@ def get_completion(self, messages, stream=False): ) - def get_response(self, messages: list[dict], tools: Optional[list] = None): + def get_response(self, messages: list[dict], tools: list | None = None): """ Returns a response using OpenAI's Response API approach. 
""" diff --git a/py-src/data_formulator/agents/web_utils.py b/py-src/data_formulator/agents/web_utils.py index 1fd3aaea..a04f6f48 100644 --- a/py-src/data_formulator/agents/web_utils.py +++ b/py-src/data_formulator/agents/web_utils.py @@ -3,7 +3,6 @@ import requests from bs4 import BeautifulSoup -from typing import Optional, Union import logging from urllib.parse import urlparse import tempfile @@ -111,7 +110,7 @@ def _validate_url_for_ssrf(url: str) -> str: return url -def download_html_content(url: str, timeout: int = 30, headers: Optional[dict] = None) -> str: +def download_html_content(url: str, timeout: int = 30, headers: dict | None = None) -> str: """ Download HTML content from a given URL with SSRF protection. @@ -254,7 +253,7 @@ def html_to_text(html_content: str, remove_scripts: bool = True, remove_styles: # Fallback: return the raw content if parsing fails return html_content -def get_html_title(html_content: str) -> Optional[str]: +def get_html_title(html_content: str) -> str | None: """ Extract the title from HTML content. @@ -276,7 +275,7 @@ def get_html_title(html_content: str) -> Optional[str]: return None -def get_html_meta_description(html_content: str) -> Optional[str]: +def get_html_meta_description(html_content: str) -> str | None: """ Extract the meta description from HTML content. diff --git a/py-src/data_formulator/app.py b/py-src/data_formulator/app.py index a767d277..b5f0bbed 100644 --- a/py-src/data_formulator/app.py +++ b/py-src/data_formulator/app.py @@ -2,22 +2,19 @@ # Licensed under the MIT License. import argparse -import random import sys import os import mimetypes -from functools import lru_cache mimetypes.add_type('application/javascript', '.js') mimetypes.add_type('application/javascript', '.mjs') import flask -from flask import Flask, request, send_from_directory, session +from flask import Flask, request, send_from_directory from flask import stream_with_context, Response import webbrowser import threading import numpy as np -import datetime import time import logging @@ -28,28 +25,14 @@ from dotenv import load_dotenv import secrets import base64 -APP_ROOT = Path(Path(__file__).parent).absolute() - -import os - -# blueprints -from data_formulator.tables_routes import tables_bp -from data_formulator.agent_routes import agent_bp -from data_formulator.demo_stream_routes import demo_stream_bp, limiter as demo_stream_limiter -from data_formulator.db_manager import db_manager -from data_formulator.example_datasets_config import EXAMPLE_DATASETS -import queue -from typing import Dict, Any +APP_ROOT = Path(Path(__file__).parent).absolute() +# Create Flask app (lightweight, no heavy imports yet) app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist")) -app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions +app.secret_key = secrets.token_hex(16) app.json.sort_keys = False -# Initialize rate limiter for demo stream routes that call external APIs -# The limiter is defined in demo_stream_routes.py to avoid circular imports -demo_stream_limiter.init_app(app) - class CustomJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.int64): @@ -65,7 +48,7 @@ def default(self, obj): load_dotenv(os.path.join(APP_ROOT, 'api-keys.env')) load_dotenv(os.path.join(APP_ROOT, '.env')) -# Add this line to store args at app level +# Default config from env (can be overridden by CLI args) app.config['CLI_ARGS'] = { 'exec_python_in_subprocess': os.environ.get('EXEC_PYTHON_IN_SUBPROCESS', 'false').lower() 
== 'true', 'disable_display_keys': os.environ.get('DISABLE_DISPLAY_KEYS', 'false').lower() == 'true', @@ -74,19 +57,11 @@ def default(self, obj): 'project_front_page': os.environ.get('PROJECT_FRONT_PAGE', 'false').lower() == 'true' } -# register blueprints -# Only register tables blueprint if database is not disabled -if not app.config['CLI_ARGS']['disable_database']: - app.register_blueprint(tables_bp) -app.register_blueprint(agent_bp) -app.register_blueprint(demo_stream_bp) - # Get logger for this module (logging config moved to run_app function) logger = logging.getLogger(__name__) def configure_logging(): """Configure logging for the Flask application.""" - # Configure root logger for general application logging logging.basicConfig( level=logging.ERROR, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', @@ -98,14 +73,38 @@ def configure_logging(): logging.getLogger('litellm').setLevel(logging.WARNING) logging.getLogger('openai').setLevel(logging.WARNING) - # Configure Flask app logger to use the same settings app.logger.handlers = [] for handler in logging.getLogger().handlers: app.logger.addHandler(handler) +def _register_blueprints(disable_database: bool): + """ + Import and register blueprints. This is where heavy imports happen. + Called from run_app() with progress feedback. + """ + # Import tables routes (imports database connectors) + print(" Loading database connectors...", flush=True) + from data_formulator.tables_routes import tables_bp + + # Import agent routes (imports AI/ML libraries: litellm, sklearn, etc.) + print(" Loading AI agents...", flush=True) + from data_formulator.agent_routes import agent_bp + + # Import demo stream routes + from data_formulator.demo_stream_routes import demo_stream_bp, limiter as demo_stream_limiter + demo_stream_limiter.init_app(app) + + # Register blueprints + if not disable_database: + app.register_blueprint(tables_bp) + app.register_blueprint(agent_bp) + app.register_blueprint(demo_stream_bp) + + @app.route('/api/example-datasets') def get_sample_datasets(): + from data_formulator.example_datasets_config import EXAMPLE_DATASETS return flask.jsonify(EXAMPLE_DATASETS) @@ -116,116 +115,21 @@ def index_alt(path): @app.errorhandler(404) def page_not_found(e): - # your processing here logger.info(app.static_folder) - return send_from_directory(app.static_folder, "index.html") #'Hello 404!' 
#send_from_directory(app.static_folder, "index.html") - -###### test functions ###### - -@app.route('/api/hello') -def hello(): - values = [ - {"a": "A", "b": 28}, {"a": "B", "b": 55}, {"a": "C", "b": 43}, - {"a": "D", "b": 91}, {"a": "E", "b": 81}, {"a": "F", "b": 53}, - {"a": "G", "b": 19}, {"a": "H", "b": 87}, {"a": "I", "b": 52} - ] - spec = { - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "description": "A simple bar chart with embedded data.", - "data": { "values": values }, - "mark": "bar", - "encoding": { - "x": {"field": "a", "type": "nominal", "axis": {"labelAngle": 0}}, - "y": {"field": "b", "type": "quantitative"} - } - } - return json.dumps(spec) - -@app.route('/api/hello-stream') -def streamed_response(): - def generate(): - values = [ - {"a": "A", "b": 28}, {"a": "B", "b": 55}, {"a": "C", "b": 43}, - {"a": "D", "b": 91}, {"a": "E", "b": 81}, {"a": "F", "b": 53}, - {"a": "G", "b": 19}, {"a": "H", "b": 87}, {"a": "I", "b": 52} - ] - spec = { - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "description": "A simple bar chart with embedded data.", - "data": { "values": [] }, - "mark": "bar", - "encoding": { - "x": {"field": "a", "type": "nominal", "axis": {"labelAngle": 0}}, - "y": {"field": "b", "type": "quantitative"} - } - } - for i in range(3): - time.sleep(3) - spec["data"]["values"] = values[i:] - yield json.dumps(spec) - return Response(stream_with_context(generate())) + return send_from_directory(app.static_folder, "index.html") -@app.route('/api/get-session-id', methods=['GET', 'POST']) -def get_session_id(): - """Endpoint to get or confirm a session ID from the client""" - # if it is a POST request, we expect a session_id in the body - # if it is a GET request, we do not expect a session_id in the query params - - current_session_id = None - if request.is_json: - content = request.get_json() - current_session_id = content.get("session_id", None) - - # Check if database is disabled - database_disabled = app.config['CLI_ARGS']['disable_database'] - - if database_disabled: - # When database is disabled, don't use Flask sessions (cookies) - # Just return the provided session_id or generate a new one - if current_session_id is None: - current_session_id = secrets.token_hex(16) - logger.info(f"Generated session ID for disabled database: {current_session_id}") - else: - logger.info(f"Using provided session ID for disabled database: {current_session_id}") - - return flask.jsonify({ - "status": "ok", - "session_id": current_session_id - }) - else: - # When database is enabled, use Flask sessions (cookies) as before - if current_session_id is None: - if 'session_id' not in session: - session['session_id'] = secrets.token_hex(16) - session.permanent = True - logger.info(f"Created new session: {session['session_id']}") - else: - # override the session_id - session['session_id'] = current_session_id - session.permanent = True - - return flask.jsonify({ - "status": "ok", - "session_id": session['session_id'] - }) @app.route('/api/app-config', methods=['GET']) def get_app_config(): """Provide frontend configuration settings from CLI arguments""" args = app.config['CLI_ARGS'] - # When database is disabled, don't try to access session - session_id = None - if not args['disable_database']: - session_id = session.get('session_id', None) - config = { "EXEC_PYTHON_IN_SUBPROCESS": args['exec_python_in_subprocess'], "DISABLE_DISPLAY_KEYS": args['disable_display_keys'], "DISABLE_DATABASE": args['disable_database'], "DISABLE_FILE_UPLOAD": 
args['disable_file_upload'], "PROJECT_FRONT_PAGE": args['project_front_page'], - "SESSION_ID": session_id } return flask.jsonify(config) @@ -238,7 +142,6 @@ def database_disabled_fallback(path): "message": "Database functionality is disabled. Use --disable-database=false to enable table operations." }), 503 else: - # If database is not disabled but we're hitting this route, it means the tables blueprint wasn't registered return flask.jsonify({ "status": "error", "message": "Table routes are not available" @@ -264,12 +167,12 @@ def parse_args() -> argparse.Namespace: def run_app(): - # Configure logging only when actually running the app - configure_logging() + print("Starting Data Formulator...", flush=True) + configure_logging() args = parse_args() - # Add this line to make args available to routes - # override the args from the env file + + # Override config from CLI args app.config['CLI_ARGS'] = { 'exec_python_in_subprocess': args.exec_python_in_subprocess, 'disable_display_keys': args.disable_display_keys, @@ -278,18 +181,21 @@ def run_app(): 'project_front_page': args.project_front_page } + # Register blueprints (this is where heavy imports happen) + _register_blueprints(args.disable_database) + # Update database manager state + from data_formulator.db_manager import db_manager db_manager._disabled = args.disable_database + url = "http://localhost:{0}".format(args.port) + print(f"Ready! Open {url} in your browser.", flush=True) + if not args.dev: - url = "http://localhost:{0}".format(args.port) - threading.Timer(2, lambda: webbrowser.open(url, new=2)).start() + threading.Timer(1.5, lambda: webbrowser.open(url, new=2)).start() - # Enable debug mode and auto-reload in development mode debug_mode = args.dev app.run(host='0.0.0.0', port=args.port, debug=debug_mode, use_reloader=debug_mode) if __name__ == '__main__': - #app.run(debug=True, host='127.0.0.1', port=5000) - #use 0.0.0.0 for public run_app() diff --git a/py-src/data_formulator/auth.py b/py-src/data_formulator/auth.py new file mode 100644 index 00000000..ba54b7fb --- /dev/null +++ b/py-src/data_formulator/auth.py @@ -0,0 +1,103 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Authentication and identity management for Data Formulator. + +This module provides a hybrid identity system that supports both anonymous +browser-based users and authenticated users (via Azure App Service or JWT). + +Security Model: +- Anonymous users: Browser UUID from X-Identity-Id header (prefixed with "browser:") +- Authenticated users: Verified identity from Azure headers or JWT (prefixed with "user:") +- Namespacing ensures authenticated user data cannot be accessed by spoofing headers +""" + +import logging +from flask import request, current_app + +logger = logging.getLogger(__name__) + + +def get_identity_id() -> str: + """ + Get identity ID with proper security priority: + + 1. Verified user from Azure App Service auth headers (trusted, set by Azure) + 2. Verified user from JWT bearer token (trusted, cryptographically verified) + 3. Browser ID from X-Identity-Id header (untrusted, for anonymous users only) + + The key insight: for anonymous users, we trust X-Identity-Id because there's + no security risk (who cares if someone "steals" a random UUID?). For authenticated + users, we MUST extract identity from verified sources, not client-provided headers. + + Identity is namespaced as "user:" or "browser:" to ensure authenticated + user data is never accessible via anonymous browser identity spoofing. 
+ + Returns: + str: The namespaced identity ID string (e.g., "user:alice@..." or "browser:550e8400-...") + + Raises: + ValueError: If no identity could be determined + """ + + # Priority 1: Azure App Service Authentication (EasyAuth) + # When deployed to Azure with authentication enabled, Azure injects these headers. + # These are SET BY AZURE (not the client) after verifying the user's identity. + azure_principal_id = request.headers.get('X-MS-CLIENT-PRINCIPAL-ID') + if azure_principal_id: + logger.debug(f"Using Azure principal ID: {azure_principal_id[:8]}...") + return f"user:{azure_principal_id}" + + # Priority 2: JWT Bearer Token (for custom auth implementations) + # If you implement your own auth, verify the JWT here and extract user ID. + # Example (uncomment and configure when implementing JWT auth): + # + # auth_header = request.headers.get('Authorization', '') + # if auth_header.startswith('Bearer '): + # token = auth_header[7:] + # try: + # import jwt + # payload = jwt.decode(token, current_app.config['JWT_SECRET'], algorithms=['HS256']) + # user_id = payload.get('sub') or payload.get('user_id') + # if user_id: + # logger.debug(f"Using JWT user ID: {user_id[:8]}...") + # return f"user:{user_id}" + # except Exception as e: + # logger.warning(f"Invalid JWT token: {e}") + # # Fall through to browser identity + + # Priority 3: Anonymous browser identity (UNTRUSTED - from client header) + # SECURITY: We NEVER trust the namespace prefix from X-Identity-Id header. + # Even if client sends "user:alice@...", we force "browser:" prefix. + # Only verified auth (Azure headers, JWT) can result in "user:" prefix. + client_identity = request.headers.get('X-Identity-Id') + if client_identity: + # Extract the ID part, ignoring any client-provided prefix + # e.g., "browser:550e8400-..." → "550e8400-..." + # e.g., "user:alice@..." → "alice@..." (but forced to browser: namespace) + if ':' in client_identity: + # Strip the prefix - we don't trust client-provided namespaces + identity_value = client_identity.split(':', 1)[1] + else: + identity_value = client_identity + + # Always use browser: prefix for client-provided identities + return f"browser:{identity_value}" + + raise ValueError("X-Identity-Id header is required. Please refresh the page.") + + +def get_identity_id_optional() -> str | None: + """ + Get identity ID if available, returning None instead of raising an error. + + Useful for endpoints where identity is optional (e.g., when language != "sql"). 
+ + Returns: + str | None: The namespaced identity ID string, or None if not available + """ + try: + return get_identity_id() + except ValueError: + return None diff --git a/py-src/data_formulator/data_loader/__init__.py b/py-src/data_formulator/data_loader/__init__.py index f61a6851..898c50f5 100644 --- a/py-src/data_formulator/data_loader/__init__.py +++ b/py-src/data_formulator/data_loader/__init__.py @@ -21,4 +21,15 @@ "athena": AthenaDataLoader } -__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader", "PostgreSQLDataLoader", "MongoDBDataLoader", "BigQueryDataLoader", "AthenaDataLoader", "DATA_LOADERS"] \ No newline at end of file +__all__ = [ + "ExternalDataLoader", + "MySQLDataLoader", + "MSSQLDataLoader", + "KustoDataLoader", + "S3DataLoader", + "AzureBlobDataLoader", + "PostgreSQLDataLoader", + "MongoDBDataLoader", + "BigQueryDataLoader", + "AthenaDataLoader", + "DATA_LOADERS"] \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/athena_data_loader.py b/py-src/data_formulator/data_loader/athena_data_loader.py index 8ba617fe..1198315e 100644 --- a/py-src/data_formulator/data_loader/athena_data_loader.py +++ b/py-src/data_formulator/data_loader/athena_data_loader.py @@ -5,8 +5,7 @@ import duckdb from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from typing import Dict, Any, List, Optional -from data_formulator.security import validate_sql_query +from typing import Any try: import boto3 @@ -54,7 +53,7 @@ def _validate_s3_url(url: str) -> None: raise ValueError(f"Invalid S3 URL format: '{url}'. Expected format: 's3://bucket/path'") -def _escape_sql_string(value: Optional[str]) -> str: +def _escape_sql_string(value: str | None) -> str: """Escape single quotes in SQL string values.""" if value is None: return "" @@ -69,7 +68,7 @@ class AthenaDataLoader(ExternalDataLoader): """ @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "aws_profile", "type": "string", "required": False, "default": "", "description": "AWS profile name from ~/.aws/credentials (if set, access key and secret are not required)"}, {"name": "aws_access_key_id", "type": "string", "required": False, "default": "", "description": "AWS access key ID (not required if using aws_profile)"}, @@ -160,7 +159,7 @@ def auth_instructions() -> str: **Security:** Never share secret keys, rotate regularly, use least privilege permissions. """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): if not BOTO3_AVAILABLE: raise ImportError( "boto3 is required for Athena connections. 
" @@ -398,7 +397,7 @@ def _execute_query(self, query: str) -> str: wait_time = min(2 ** (elapsed // 10), 10) time.sleep(wait_time) - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: """List tables from Athena catalog (Glue Data Catalog).""" results = [] @@ -469,7 +468,7 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: log.info(f"Returning {len(results)} tables") return results - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): """Ingest data from an Athena table by executing a SELECT query.""" # Validate table name to prevent SQL injection _validate_athena_table_name(table_name) @@ -511,49 +510,3 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, """) log.info(f"Successfully ingested data into table '{name_as}'") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - """Execute query and return sample results.""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - # Add LIMIT if not present to avoid large result sets - query_upper = query.upper() - if "LIMIT" not in query_upper: - query = f"{query.rstrip().rstrip(';')} LIMIT 10" - - # Execute query on Athena - result_location = self._execute_query(query) - - # Validate the result location is a proper S3 URL - _validate_s3_url(result_location) - - # Load results from S3 - df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{_escape_sql_string(result_location)}')").df() - - return json.loads(df.head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str): - """Execute Athena query and ingest results into DuckDB.""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - name_as = sanitize_table_name(name_as) - - # Execute query on Athena - log.info(f"Executing Athena query for table '{name_as}'") - result_location = self._execute_query(query) - - # Validate the result location is a proper S3 URL - _validate_s3_url(result_location) - - # Load results from S3 into DuckDB - log.info(f"Loading query results from {result_location}") - self.duck_db_conn.execute(f""" - CREATE OR REPLACE TABLE main.{name_as} AS - SELECT * FROM read_csv_auto('{_escape_sql_string(result_location)}') - """) - - log.info(f"Successfully ingested data into table '{name_as}'") diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py index 1206f4e0..83b53b4a 100644 --- a/py-src/data_formulator/data_loader/azure_blob_data_loader.py +++ b/py-src/data_formulator/data_loader/azure_blob_data_loader.py @@ -4,8 +4,7 @@ import os from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from typing import Dict, Any, List -from data_formulator.security import validate_sql_query +from typing import Any try: from azure.storage.blob import BlobServiceClient, ContainerClient @@ -17,7 +16,7 @@ class AzureBlobDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "account_name", "type": 
"string", "required": True, "default": "", "description": "Azure storage account name"}, {"name": "container_name", "type": "string", "required": True, "default": "", "description": "Azure blob container name"}, @@ -65,7 +64,7 @@ def auth_instructions() -> str: - JSON files (.json, .jsonl) """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): if not AZURE_BLOB_AVAILABLE: raise ImportError( "Azure storage libraries are required for Azure Blob connections. " @@ -130,7 +129,7 @@ def _setup_azure_authentication(self): ) """) - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: # Use Azure SDK to list blobs in the container from azure.storage.blob import BlobServiceClient @@ -348,7 +347,7 @@ def _estimate_by_row_sampling(self, azure_url: str, file_extension: str) -> int: print(f"Error in row sampling for {azure_url}: {e}") return 0 - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): if name_as is None: name_as = table_name.split('/')[-1].split('.')[0] @@ -384,21 +383,4 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, LIMIT {size} """) else: - raise ValueError(f"Unsupported file type: {table_name}") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str): - # Execute the query and get results as a DataFrame - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self.duck_db_conn.execute(query).df() - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) \ No newline at end of file + raise ValueError(f"Unsupported file type: {table_name}") \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/bigquery_data_loader.py b/py-src/data_formulator/data_loader/bigquery_data_loader.py index cd3c1cf6..a61d0f4b 100644 --- a/py-src/data_formulator/data_loader/bigquery_data_loader.py +++ b/py-src/data_formulator/data_loader/bigquery_data_loader.py @@ -1,12 +1,11 @@ import json import logging import re -from typing import Dict, Any, List, Optional +from typing import Any import pandas as pd import duckdb from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from data_formulator.security import validate_sql_query try: from google.cloud import bigquery @@ -21,7 +20,7 @@ class BigQueryDataLoader(ExternalDataLoader): """BigQuery data loader implementation""" @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: return [ {"name": "project_id", "type": "text", "required": True, "description": "Google Cloud Project ID", "default": ""}, {"name": "dataset_id", "type": "text", "required": False, "description": "Dataset ID(s) - leave empty for all, or specify one (e.g., 'billing') or multiple separated by commas (e.g., 
'billing,enterprise_collected,ga_api')", "default": ""}, @@ -68,7 +67,7 @@ def auth_instructions() -> str: - Execute custom SQL queries """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): if not BIGQUERY_AVAILABLE: raise ImportError( "google-cloud-bigquery is required for BigQuery connections. " @@ -96,7 +95,7 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti location=self.location ) - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: """List tables from BigQuery datasets""" results = [] @@ -204,7 +203,7 @@ def safe_convert(x): return df - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): """Ingest data from BigQuery table into DuckDB with stable, de-duplicated column aliases.""" if name_as is None: name_as = table_name.split('.')[-1] @@ -295,38 +294,3 @@ def process_field(field, parent_path: str = ""): df = self._convert_bigquery_dtypes(df) self.ingest_df_to_duckdb(df, name_as) - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - """Execute query and return sample results as a list of dictionaries""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - # Add LIMIT if not present - if "LIMIT" not in query.upper(): - query += " LIMIT 10" - - df = self.client.query(query).to_dataframe() - return json.loads(df.to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """Execute custom query and ingest results into DuckDB""" - name_as = sanitize_table_name(name_as) - - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - # Execute query and get DataFrame - df = self.client.query(query).to_dataframe() - - # Drop duplicate columns - df = df.loc[:, ~df.columns.duplicated()] - - # Convert BigQuery-specific dtypes - df = self._convert_bigquery_dtypes(df) - - # Use base class method to ingest DataFrame - self.ingest_df_to_duckdb(df, name_as) - - return df diff --git a/py-src/data_formulator/data_loader/external_data_loader.py b/py-src/data_formulator/data_loader/external_data_loader.py index 41060d87..0e50c0b0 100644 --- a/py-src/data_formulator/data_loader/external_data_loader.py +++ b/py-src/data_formulator/data_loader/external_data_loader.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, Any, List +from typing import Any import pandas as pd import json import duckdb @@ -76,7 +76,7 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str): @staticmethod @abstractmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: pass @staticmethod @@ -84,16 +84,16 @@ def list_params() -> List[Dict[str, Any]]: def auth_instructions() -> str: pass @abstractmethod - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): pass @abstractmethod - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def 
list_tables(self, table_filter: str = None) -> list[dict[str, Any]]: # should include: table_name, column_names, column_types, sample_data pass @abstractmethod - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: list[str] = None, sort_order: str = 'asc'): """Ingest data from a table into DuckDB. Args: @@ -104,12 +104,3 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_order: Sort direction, 'asc' for ascending or 'desc' for descending """ pass - - @abstractmethod - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - pass - - @abstractmethod - def ingest_data_from_query(self, query: str, name_as: str): - pass - diff --git a/py-src/data_formulator/data_loader/kusto_data_loader.py b/py-src/data_formulator/data_loader/kusto_data_loader.py index 6ed6d602..a5044005 100644 --- a/py-src/data_formulator/data_loader/kusto_data_loader.py +++ b/py-src/data_formulator/data_loader/kusto_data_loader.py @@ -1,6 +1,6 @@ import logging import sys -from typing import Dict, Any, List +from typing import Any import pandas as pd import json import duckdb @@ -60,7 +60,7 @@ def auth_instructions() -> str: - kusto_database: Name of the database you want to access """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): if not KUSTO_AVAILABLE: raise ImportError( "azure-kusto-data is required for Kusto/Azure Data Explorer connections. " @@ -156,7 +156,7 @@ def query(self, kql: str) -> pd.DataFrame: return df - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: query = ".show tables" tables_df = self.query(query) @@ -197,7 +197,7 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: return tables - def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000, sort_columns: List[str] = None, sort_order: str = 'asc') -> pd.DataFrame: + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 5000000, sort_columns: list[str] | None = None, sort_order: str = 'asc') -> pd.DataFrame: if name_as is None: name_as = table_name @@ -252,14 +252,4 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000, self.duck_db_conn.execute(f"INSERT INTO {name_as} SELECT * FROM df_temp_{random_suffix}") self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}") - total_rows_ingested += len(chunk_df) - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - df = self.query(query).head(10) - return json.loads(df.to_json(orient="records", date_format='iso')) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - # Sanitize the table name for SQL compatibility - name_as = sanitize_table_name(name_as) - df = self.query(query) - self.ingest_df_to_duckdb(df, name_as) \ No newline at end of file + total_rows_ingested += len(chunk_df) \ No newline at end of file diff --git a/py-src/data_formulator/data_loader/mongodb_data_loader.py b/py-src/data_formulator/data_loader/mongodb_data_loader.py index 6b460354..55087867 100644 --- a/py-src/data_formulator/data_loader/mongodb_data_loader.py +++ b/py-src/data_formulator/data_loader/mongodb_data_loader.py @@ -9,9 +9,7 
@@ from datetime import datetime from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name - -from data_formulator.security import validate_sql_query -from typing import Dict, Any, Optional, List +from typing import Any class MongoDBDataLoader(ExternalDataLoader): @@ -56,7 +54,7 @@ def auth_instructions() -> str: - Test connection: `mongosh --host [host] --port [port]` """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params self.duck_db_conn = duck_db_conn @@ -113,7 +111,7 @@ def __del__(self): self.close() @staticmethod - def _flatten_document(doc: Dict[str, Any], parent_key: str = '', sep: str = '_') -> Dict[str, Any]: + def _flatten_document(doc: dict[str, Any], parent_key: str = '', sep: str = '_') -> dict[str, Any]: """ Use recursion to flatten nested MongoDB documents """ @@ -139,7 +137,7 @@ def _flatten_document(doc: Dict[str, Any], parent_key: str = '', sep: str = '_') return dict(items) @staticmethod - def _convert_special_types(doc: Dict[str, Any]) -> Dict[str, Any]: + def _convert_special_types(doc: dict[str, Any]) -> dict[str, Any]: """ Convert MongoDB special types (ObjectId, datetime, etc.) to serializable types """ @@ -165,7 +163,7 @@ def _convert_special_types(doc: Dict[str, Any]) -> Dict[str, Any]: result[key] = value return result - def _process_documents(self, documents: List[Dict[str, Any]]) -> pd.DataFrame: + def _process_documents(self, documents: list[dict[str, Any]]) -> pd.DataFrame: """ Process MongoDB documents list, flatten and convert to DataFrame """ @@ -240,7 +238,7 @@ def list_tables(self, table_filter: str = None): return results - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 100000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 100000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): """ Import MongoDB collection data into DuckDB """ @@ -275,52 +273,6 @@ def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int self._load_dataframe_to_duckdb(df, name_as, size) return - - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - - self._existed_collections_in_duckdb() - self._difference_collections() - self._preload_all_collections(self.collection.name if self.collection else "") - - result, error_message = validate_sql_query(query) - if not result: - print(error_message) - raise ValueError(error_message) - - result_query = json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - self._drop_all_loaded_tables() - - for collection_name, df in self.existed_collections.items(): - self._load_dataframe_to_duckdb(df, collection_name) - - return result_query - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """ - Create a new table from query results - """ - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - name_as = sanitize_table_name(name_as) - - self._existed_collections_in_duckdb() - self._difference_collections() - self._preload_all_collections(self.collection.name if self.collection else "") - - query_result_df = self.duck_db_conn.execute(query).df() - - self._drop_all_loaded_tables() - - for collection_name, existing_df in self.existed_collections.items(): - 
self._load_dataframe_to_duckdb(existing_df, collection_name) - - self._load_dataframe_to_duckdb(query_result_df, name_as) - - return query_result_df @staticmethod def _quote_identifier(name: str) -> str: diff --git a/py-src/data_formulator/data_loader/mssql_data_loader.py b/py-src/data_formulator/data_loader/mssql_data_loader.py index 1f18a794..44a8fdb5 100644 --- a/py-src/data_formulator/data_loader/mssql_data_loader.py +++ b/py-src/data_formulator/data_loader/mssql_data_loader.py @@ -1,6 +1,6 @@ import json import logging -from typing import Dict, Any, Optional, List +from typing import Any import duckdb import pandas as pd @@ -12,7 +12,6 @@ PYODBC_AVAILABLE = False from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from data_formulator.security import validate_sql_query log = logging.getLogger(__name__) @@ -144,7 +143,7 @@ def auth_instructions() -> str: - Windows: Install SQL Server or download ODBC driver separately """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): log.info("Initializing MSSQL DataLoader with parameters: %s", params) if not PYODBC_AVAILABLE: @@ -387,7 +386,7 @@ def list_tables(self): return results - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): """Ingest data from SQL Server table into DuckDB""" # Parse table name (assuming format: schema.table) if "." in table_name: @@ -420,46 +419,3 @@ def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int except Exception as e: log.error(f"Failed to ingest data from {table_name}: {e}") raise - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - """Execute a custom query and return sample results""" - try: - # Add TOP 10 if not already present for SELECT queries - modified_query = query.strip() - if ( - modified_query.upper().startswith("SELECT") - and not modified_query.upper().startswith("SELECT TOP") - and "TOP " not in modified_query.upper()[:50] - ): # Check first 50 chars - modified_query = modified_query.replace("SELECT", "SELECT TOP 10", 1) - - result, error_message = validate_sql_query(modified_query) - if not result: - raise ValueError(error_message) - - df = self._execute_query(modified_query) - - # Handle NaN values for JSON serialization - df_clean = df.fillna(value=None) - return json.loads( - df_clean.head(10).to_json(orient="records", date_format="iso", default_handler=str) - ) - except Exception as e: - log.error(f"Failed to execute query sample: {e}") - raise - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """Execute a custom query and ingest results into DuckDB""" - try: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self._execute_query(query) - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) - log.info(f"Successfully ingested {len(df)} rows from custom query to {name_as}") - return df - except Exception as e: - log.error(f"Failed to execute and ingest custom query: {e}") - raise diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py 
b/py-src/data_formulator/data_loader/mysql_data_loader.py index 0430a57a..76ee3c15 100644 --- a/py-src/data_formulator/data_loader/mysql_data_loader.py +++ b/py-src/data_formulator/data_loader/mysql_data_loader.py @@ -5,9 +5,7 @@ import duckdb from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name - -from data_formulator.security import validate_sql_query -from typing import Dict, Any, Optional, List +from typing import Any try: import pymysql @@ -21,7 +19,7 @@ class MySQLDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "user", "type": "string", "required": True, "default": "root", "description": ""}, {"name": "password", "type": "string", "required": False, "default": "", "description": "leave blank for no password"}, @@ -58,7 +56,7 @@ def auth_instructions() -> str: - Test connection: `mysql -u [username] -p -h [host] -P [port] [database]` """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): if not PYMYSQL_AVAILABLE: raise ImportError( "pymysql is required for MySQL connections. " @@ -154,7 +152,7 @@ def _reconnect_if_needed(self): charset='utf8mb4' ) - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: # Get list of tables from the connected database # Filter by the specific database we're connected to for better performance tables_query = """ @@ -220,7 +218,7 @@ def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: return results - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): """Fetch data from MySQL and ingest into DuckDB.""" if name_as is None: name_as = table_name.split('.')[-1] @@ -264,28 +262,6 @@ def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int self.ingest_df_to_duckdb(df, name_as) logger.info(f"Successfully ingested {len(df)} rows from {table_name} into DuckDB table {name_as}") - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - # Execute query via native MySQL connection - df = self._execute_query(query) - return json.loads(df.head(10).to_json(orient="records", date_format='iso')) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - """Execute custom query and ingest results into DuckDB.""" - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - # Execute query via native MySQL connection - df = self._execute_query(query) - - # Ingest into DuckDB using the base class method - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) - return df - def close(self): """Explicitly close the MySQL connection.""" if hasattr(self, 'mysql_conn') and self.mysql_conn: diff --git a/py-src/data_formulator/data_loader/postgresql_data_loader.py b/py-src/data_formulator/data_loader/postgresql_data_loader.py index b327a737..21766770 100644 --- a/py-src/data_formulator/data_loader/postgresql_data_loader.py +++ 
b/py-src/data_formulator/data_loader/postgresql_data_loader.py @@ -4,14 +4,12 @@ import duckdb from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name - -from typing import Dict, Any, List, Optional -from data_formulator.security import validate_sql_query +from typing import Any class PostgreSQLDataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "user", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL username"}, {"name": "password", "type": "string", "required": False, "default": "", "description": "leave blank for no password"}, @@ -25,7 +23,7 @@ def list_params() -> List[Dict[str, Any]]: def auth_instructions() -> str: return "Provide your PostgreSQL connection details. The user must have SELECT permissions on the tables you want to access." - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): self.params = params self.duck_db_conn = duck_db_conn @@ -130,7 +128,7 @@ def list_tables(self): print(f"Error listing tables: {e}") return [] - def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): # Create table in the main DuckDB database from Postgres data if name_as is None: name_as = table_name.split('.')[-1] @@ -151,21 +149,3 @@ def ingest_data(self, table_name: str, name_as: Optional[str] = None, size: int {order_by_clause} LIMIT {size} """) - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame: - # Execute the query and get results as a DataFrame - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self.duck_db_conn.execute(query).df() - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) - return df diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py index d92b7c41..3c194d23 100644 --- a/py-src/data_formulator/data_loader/s3_data_loader.py +++ b/py-src/data_formulator/data_loader/s3_data_loader.py @@ -4,8 +4,7 @@ import os from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name -from typing import Dict, Any, List -from data_formulator.security import validate_sql_query +from typing import Any try: import boto3 @@ -16,7 +15,7 @@ class S3DataLoader(ExternalDataLoader): @staticmethod - def list_params() -> List[Dict[str, Any]]: + def list_params() -> list[dict[str, Any]]: params_list = [ {"name": "aws_access_key_id", "type": "string", "required": True, "default": "", "description": "AWS access key ID"}, {"name": "aws_secret_access_key", "type": "string", "required": True, "default": "", "description": "AWS secret access key"}, @@ -63,7 +62,7 @@ def auth_instructions() -> str: **Security:** Never share secret keys, rotate 
regularly, use least privilege permissions. """ - def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): + def __init__(self, params: dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection): if not BOTO3_AVAILABLE: raise ImportError( "boto3 is required for S3 connections. " @@ -91,7 +90,7 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti if self.aws_session_token: # Add this block self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'") - def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]: + def list_tables(self, table_filter: str | None = None) -> list[dict[str, Any]]: # Use boto3 to list objects in the bucket import boto3 @@ -181,7 +180,7 @@ def _estimate_row_count(self, s3_url: str) -> int: print(f"Error estimating row count for {s3_url}: {e}") return 0 - def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, sort_columns: List[str] = None, sort_order: str = 'asc'): + def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000, sort_columns: list[str] | None = None, sort_order: str = 'asc'): if name_as is None: name_as = table_name.split('/')[-1].split('.')[0] @@ -217,21 +216,4 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000, LIMIT {size} """) else: - raise ValueError(f"Unsupported file type: {table_name}") - - def view_query_sample(self, query: str) -> List[Dict[str, Any]]: - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - return json.loads(self.duck_db_conn.execute(query).df().head(10).to_json(orient="records")) - - def ingest_data_from_query(self, query: str, name_as: str): - # Execute the query and get results as a DataFrame - result, error_message = validate_sql_query(query) - if not result: - raise ValueError(error_message) - - df = self.duck_db_conn.execute(query).df() - # Use the base class's method to ingest the DataFrame - self.ingest_df_to_duckdb(df, sanitize_table_name(name_as)) \ No newline at end of file + raise ValueError(f"Unsupported file type: {table_name}") \ No newline at end of file diff --git a/py-src/data_formulator/db_manager.py b/py-src/data_formulator/db_manager.py index 66bf2c8c..5212dbaa 100644 --- a/py-src/data_formulator/db_manager.py +++ b/py-src/data_formulator/db_manager.py @@ -3,7 +3,6 @@ import duckdb import pandas as pd -from typing import Dict import tempfile import os from contextlib import contextmanager @@ -15,7 +14,7 @@ class DuckDBManager: def __init__(self, local_db_dir: str, disabled: bool = False): # Store session db file paths - self._db_files: Dict[str, str] = {} + self._db_files: dict[str, str] = {} self._local_db_dir: str = local_db_dir self._disabled: bool = disabled diff --git a/py-src/data_formulator/demo_stream_routes.py b/py-src/data_formulator/demo_stream_routes.py index fd1229a4..42bb96e5 100644 --- a/py-src/data_formulator/demo_stream_routes.py +++ b/py-src/data_formulator/demo_stream_routes.py @@ -30,7 +30,7 @@ import math from datetime import datetime, timedelta from flask import Blueprint, Response, request, jsonify -from typing import List, Dict, Any, Optional +from typing import Any from collections import deque import threading @@ -107,9 +107,9 @@ def make_csv_response(rows: list, filename: str = "data.csv") -> Response: # Thread-safe storage for ISS position history _iss_track_lock = threading.Lock() _iss_track_history: deque = deque(maxlen=10000) # Keep last 10000 positions 
(~20000 min at 5s intervals) -_iss_last_fetch: Optional[datetime] = None +_iss_last_fetch: datetime | None = None -def _fetch_iss_position() -> Optional[Dict[str, Any]]: +def _fetch_iss_position() -> dict[str, Any] | None: """Fetch current ISS position from API""" try: response = requests.get("http://api.open-notify.org/iss-now.json", timeout=10) @@ -1074,7 +1074,7 @@ def get_yfinance_financials(): # Thread-safe storage for sales transaction history _sales_lock = threading.Lock() _sales_history: deque = deque(maxlen=1000) # Keep last 1000 transactions -_sales_last_update: Optional[datetime] = None +_sales_last_update: datetime | None = None # Products with realistic pricing and popularity _SALES_PRODUCTS = [ @@ -1097,7 +1097,7 @@ def get_yfinance_financials(): _SALES_CHANNEL_WEIGHTS = [0.40, 0.35, 0.15, 0.10] -def _generate_sale_transaction(timestamp: datetime) -> Dict[str, Any]: +def _generate_sale_transaction(timestamp: datetime) -> dict[str, Any]: """Generate a single sale transaction""" product = random.choices(_SALES_PRODUCTS, weights=[p["popularity"] for p in _SALES_PRODUCTS])[0] region = random.choices(_SALES_REGIONS, weights=_SALES_REGION_WEIGHTS)[0] diff --git a/py-src/data_formulator/security/__init__.py b/py-src/data_formulator/security/__init__.py deleted file mode 100644 index a536b8d0..00000000 --- a/py-src/data_formulator/security/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from .query_validator import validate_sql_query - -__all__ = [ 'validate_sql_query'] \ No newline at end of file diff --git a/py-src/data_formulator/security/query_validator.py b/py-src/data_formulator/security/query_validator.py deleted file mode 100644 index 8aa03db9..00000000 --- a/py-src/data_formulator/security/query_validator.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import re -import logging -from typing import Tuple, Dict, Any - -logger = logging.getLogger(__name__) - - -class QueryValidationError(Exception): - """Custom exception for query validation failures""" - pass - - -def normalize_query(query: str) -> str: - """ - Normalize query for case-insensitive matching - """ - query_normalized = re.sub(r'--.*$', '', query, flags=re.MULTILINE) # Single line comments - query_normalized = re.sub(r'/\*.*?\*/', '', query_normalized, flags=re.DOTALL) # Multi-line comments - return query_normalized.strip().lower() - -def validate_sql_query(query: str) -> Tuple[bool, str]: - """ - Simple regex-based SQL query validation for dangerous operations. 
- - Args: - query: SQL query string to validate - - Returns: - Tuple of (is_valid, error_message) - """ - try: - # Normalize query for case-insensitive matching - query_normalized = normalize_query(query) - - # Remove SQL comments - - - # Define dangerous patterns as regex patterns - dangerous_patterns = { - # File read operations - 'file_read_operations': [ - r'\bread_csv_auto\b', r'\bread_csv\b', r'\bread_json\b', r'\bread_parquet\b', - r'\bread_ndjson\b', r'\bread_delim\b', r'\bread_fwf\b', r'\bread_excel\b', - r'\bread_sql\b', r'\bread_table\b', r'\bread_html\b', r'\bread_xml\b', - r'\bread_feather\b', r'\bread_hdf\b', r'\bread_stata\b', r'\bread_sas\b', - r'\bread_spss\b', r'\bread_rdata\b', r'\bread_rds\b' - ], - - # File write operations - 'file_write_operations': [ - r'\bwrite_csv\b', r'\bwrite_json\b', r'\bwrite_parquet\b', r'\bwrite_excel\b', - r'\bwrite_sql\b', r'\bwrite_table\b', r'\bwrite_html\b', r'\bwrite_xml\b', - r'\bwrite_feather\b', r'\bwrite_hdf\b', r'\bwrite_stata\b', r'\bwrite_sas\b', - r'\bwrite_spss\b', r'\bwrite_rdata\b', r'\bwrite_rds\b' - ], - - # File system operations - 'file_system_operations': [ - r'\bglob\b', r'\bcopy\b', r'\bmove\b', r'\brename\b', r'\bdelete\b', - r'\bremove\b', r'\bunlink\b', r'\bmkdir\b', r'\bmakedirs\b', r'\brmdir\b', - r'\bremovedirs\b', r'\bchmod\b', r'\bchown\b', r'\bsymlink\b', r'\blink\b', - r'\btouch\b', r'\btruncate\b', r'\bwrite\b', r'\bappend\b' - ], - - # System operations - 'system_operations': [ - r'\bsystem\b', r'\bexec\b', r'\beval\b', r'\bcompile\b', r'\bexecfile\b', - r'\binput\b', r'\bos\.system\b', r'\bos\.popen\b', r'\bos\.spawn\b', - r'\bos\.fork\b', r'\bos\.kill\b', r'\bsubprocess\b', r'\bsubprocess\.call\b', - r'\bsubprocess\.run\b', r'\bsubprocess\.popen\b', r'\bsubprocess\.check_call\b', - r'\bsubprocess\.check_output\b' - ], - - # Network operations - 'network_operations': [ - r'\burllib\b', r'\brequests\b', r'\bhttp://\b', r'\bhttps://\b', r'\bftp://\b', - r'\bsmtp\b', r'\bpop3\b', r'\bsocket\b', r'\btelnet\b', r'\bssh\b', r'\bscp\b', - r'\bwget\b', r'\bcurl\b' - ], - - # Shell operations - 'shell_operations': [ - r'\bshell\b', r'\bcmd\b', r'\bbash\b', r'\bsh\b', r'\bpowershell\b', - r'\bcmd\.exe\b', r'\bcommand\b', r'\bexecute\b', r'\brun\b', r'\bcall\b', - r'\binvoke\b' - ], - - # DuckDB dangerous operations - 'duckdb_dangerous_operations': [ - r'\binstall\b', r'\bload\b', r'\bunload\b', r'\bexport\b', r'\bimport\b', - r'\bcopy_to\b' - ], - - # SQL injection patterns - 'sql_injection_patterns': [ - r';\s*--', # Comment after semicolon - r';\s*/\*', # Block comment after semicolon - r'\bunion\s+all\s+select\b', # UNION ALL SELECT - r'\bunion\s+select\b', # UNION SELECT - r'\bxp_cmdshell\b', # SQL Server command shell - r'\bsp_executesql\b', # SQL Server dynamic SQL - ], - - # Dangerous SQL keywords - 'dangerous_sql_keywords': [ - r'\binsert\b', r'\bupdate\b', r'\bdelete\b', r'\bdrop\b', r'\bcreate\b', - r'\balter\b', r'\btruncate\b', r'\bgrant\b', r'\brevoke\b', r'\bexecute\b', - r'\bexec\b', r'\bcall\b', r'\bbegin\b', r'\bcommit\b', r'\brollback\b' - ], - - # File path patterns - 'file_path_patterns': [ - r'file://', r'file:///', r'c:\\', r'd:\\', r'e:\\', - r'/etc/', r'/var/', r'/tmp/', r'/home/', r'/root/', - r'/usr/', r'/bin/', r'/sbin/', r'http://', r'https://', - r'ftp://', r'sftp://', r'ssh://' - ] - } - - # Check each category of dangerous patterns - for category, patterns in dangerous_patterns.items(): - for pattern in patterns: - if re.search(pattern, query_normalized, re.IGNORECASE): - return False, 
f"Dangerous {category.replace('_', ' ')} detected: {pattern}" - - # Check for file paths in string literals - string_literals = re.findall(r"'([^']*)'", query_normalized) + re.findall(r'"([^"]*)"', query_normalized) - for literal in string_literals: - for pattern in dangerous_patterns['file_path_patterns']: - if re.search(pattern, literal, re.IGNORECASE): - return False, f"Dangerous file path detected in string literal: {literal}" - - return True, "Query validation passed" - - except Exception as e: - logger.error(f"Error during query validation: {e}") - return False, f"Query validation error: {str(e)}" - - -def validate_sql_query_strict(query: str) -> Tuple[bool, str]: - """ - Strict validation that only allows SELECT queries and basic operations. - - Args: - query: SQL query string to validate - - Returns: - Tuple of (is_valid, error_message) - """ - try: - # Normalize query - query_normalized = normalize_query(query) - - # Check if it's a SELECT query - if not query_normalized.startswith('select'): - return False, "Only SELECT queries are allowed" - - # Perform regular validation - return validate_sql_query(query) - - except Exception as e: - return False, f"Strict validation error: {str(e)}" - diff --git a/py-src/data_formulator/tables_routes.py b/py-src/data_formulator/tables_routes.py index 80bdeba9..d5496d50 100644 --- a/py-src/data_formulator/tables_routes.py +++ b/py-src/data_formulator/tables_routes.py @@ -9,7 +9,7 @@ mimetypes.add_type('application/javascript', '.mjs') import json import traceback -from flask import request, send_from_directory, session, jsonify, Blueprint +from flask import request, send_from_directory, jsonify, Blueprint import pandas as pd import random import string @@ -18,9 +18,9 @@ from data_formulator.db_manager import db_manager from data_formulator.data_loader import DATA_LOADERS +from data_formulator.auth import get_identity_id import re -from typing import Tuple # Get logger for this module (logging config done in app.py) logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def list_tables(): """List all tables in the current session""" try: result = [] - with db_manager.connection(session['session_id']) as db: + with db_manager.connection(get_identity_id()) as db: table_metadata_list = db.execute(""" SELECT database_name, schema_name, table_name, schema_name==current_schema() as is_current_schema, 'table' as object_type FROM duckdb_tables() @@ -175,7 +175,7 @@ def sample_table(): total_row_count = 0 # Validate field names against table columns to prevent SQL injection - with db_manager.connection(session['session_id']) as db: + with db_manager.connection(get_identity_id()) as db: # Get valid column names columns = [col[0] for col in db.execute(f"DESCRIBE {table_id}").fetchall()] @@ -236,7 +236,7 @@ def sample_table(): def get_table_data(): """Get data from a specific table""" try: - with db_manager.connection(session['session_id']) as db: + with db_manager.connection(get_identity_id()) as db: table_name = request.args.get('table_name') # Get pagination parameters @@ -317,7 +317,7 @@ def create_table(): sanitized_table_name = sanitize_table_name(table_name) - with db_manager.connection(session['session_id']) as db: + with db_manager.connection(get_identity_id()) as db: # Check if table exists and generate unique name if needed base_name = sanitized_table_name counter = 1 @@ -364,7 +364,7 @@ def drop_table(): if not table_name: return jsonify({"status": "error", "message": "No table name provided"}), 400 - with 
db_manager.connection(session['session_id']) as db: + with db_manager.connection(get_identity_id()) as db: # First check if it exists as a view view_exists = db.execute(f"SELECT view_name FROM duckdb_views() WHERE view_name = '{table_name}'").fetchone() is not None if view_exists: @@ -406,18 +406,14 @@ def upload_db_file(): if not file.filename.endswith('.db'): return jsonify({"status": "error", "message": "Invalid file format. Only .db files are supported"}), 400 - # Get the session ID - if 'session_id' not in session: - return jsonify({"status": "error", "message": "No session ID found"}), 400 - - session_id = session['session_id'] + identity_id = get_identity_id() # Create temp directory if it doesn't exist temp_dir = os.path.join(tempfile.gettempdir()) os.makedirs(temp_dir, exist_ok=True) # Save the file temporarily to verify it - temp_db_path = os.path.join(temp_dir, f"temp_{session_id}.db") + temp_db_path = os.path.join(temp_dir, f"temp_{identity_id}.db") file.save(temp_db_path) # Verify if it's a valid DuckDB file @@ -430,11 +426,11 @@ def upload_db_file(): conn.close() # If we get here, the file is valid - move it to final location - db_file_path = os.path.join(temp_dir, f"df_{session_id}.db") + db_file_path = os.path.join(temp_dir, f"df_{identity_id}.db") os.replace(temp_db_path, db_file_path) # Update the db_manager's file mapping - db_manager._db_files[session_id] = db_file_path + db_manager._db_files[identity_id] = db_file_path except Exception as db_error: # Clean up temp file @@ -449,7 +445,7 @@ def upload_db_file(): return jsonify({ "status": "success", "message": "Database file uploaded successfully", - "session_id": session_id + "identity_id": identity_id }) except Exception as e: @@ -465,23 +461,16 @@ def upload_db_file(): def download_db_file(): """Download the db file for a session""" try: - # Check if session exists - if 'session_id' not in session: - return jsonify({ - "status": "error", - "message": "No session ID found" - }), 400 - - session_id = session['session_id'] + identity_id = get_identity_id() # Get the database file path from db_manager - if session_id not in db_manager._db_files: + if identity_id not in db_manager._db_files: return jsonify({ "status": "error", - "message": "No database file found for this session" + "message": "No database file found for this identity" }), 404 - db_file_path = db_manager._db_files[session_id] + db_file_path = db_manager._db_files[identity_id] # Check if file exists if not os.path.exists(db_file_path): @@ -491,7 +480,7 @@ def download_db_file(): }), 404 # Generate a filename for download - download_name = f"data_formulator_{session_id}.db" + download_name = f"data_formulator_{identity_id}.db" # Return the file as an attachment return send_from_directory( @@ -515,34 +504,28 @@ def download_db_file(): def reset_db_file(): """Reset the db file for a session""" try: - if 'session_id' not in session: - return jsonify({ - "status": "error", - "message": "No session ID found" - }), 400 - - session_id = session['session_id'] + identity_id = get_identity_id() - logger.info(f"session_id: {session_id}") + logger.info(f"identity_id: {identity_id}") # First check if there's a reference in db_manager - if session_id in db_manager._db_files: - db_file_path = db_manager._db_files[session_id] + if identity_id in db_manager._db_files: + db_file_path = db_manager._db_files[identity_id] # Remove the file if it exists if db_file_path and os.path.exists(db_file_path): os.remove(db_file_path) # Clear the reference - 
db_manager._db_files[session_id] = None + db_manager._db_files[identity_id] = None # Also check for any temporary files - temp_db_path = os.path.join(tempfile.gettempdir(), f"temp_{session_id}.db") + temp_db_path = os.path.join(tempfile.gettempdir(), f"temp_{identity_id}.db") if os.path.exists(temp_db_path): os.remove(temp_db_path) # Check for the main db file - main_db_path = os.path.join(tempfile.gettempdir(), f"df_{session_id}.db") + main_db_path = os.path.join(tempfile.gettempdir(), f"df_{identity_id}.db") if os.path.exists(main_db_path): os.remove(main_db_path) @@ -570,7 +553,7 @@ def analyze_table(): if not table_name: return jsonify({"status": "error", "message": "No table name provided"}), 400 - with db_manager.connection(session['session_id']) as db: + with db_manager.connection(get_identity_id()) as db: # Get column information columns = db.execute(f"DESCRIBE {table_name}").fetchall() @@ -662,7 +645,7 @@ def sanitize_table_name(table_name: str) -> str: return f'table_{uuid.uuid4()}' return sanitized_table_name -def sanitize_db_error_message(error: Exception) -> Tuple[str, int]: +def sanitize_db_error_message(error: Exception) -> tuple[str, int]: """ Sanitize error messages before sending to client. Returns a tuple of (sanitized_message, status_code) @@ -687,7 +670,7 @@ def sanitize_db_error_message(error: Exception) -> Tuple[str, int]: # Data loader errors r"Entity ID": (error_msg, 500), - r"session_id": ("session_id not found, please refresh the page", 500), + r"identity": ("Identity not found, please refresh the page", 500), } # Check if error matches any safe pattern @@ -738,7 +721,7 @@ def data_loader_list_tables(): if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) # Pass table_filter to list_tables if the data loader supports it @@ -932,7 +915,7 @@ def data_loader_ingest_data(): if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) data_loader.ingest_data(table_name, size=row_limit, sort_columns=sort_columns, sort_order=sort_order) @@ -976,7 +959,7 @@ def data_loader_view_query_sample(): if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) sample = data_loader.view_query_sample(query) @@ -1009,7 +992,7 @@ def data_loader_ingest_data_from_query(): if data_loader_type not in DATA_LOADERS: return jsonify({"status": "error", "message": f"Invalid data loader type. 
Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn) data_loader.ingest_data_from_query(query, name_as) @@ -1053,7 +1036,7 @@ def data_loader_refresh_table(): if not table_name: return jsonify({"status": "error", "message": "table_name is required"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: # Get stored metadata metadata = get_table_metadata(duck_db_conn, table_name) @@ -1132,7 +1115,7 @@ def data_loader_get_table_metadata(): if not table_name: return jsonify({"status": "error", "message": "table_name is required"}), 400 - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: metadata = get_table_metadata(duck_db_conn, table_name) if metadata: @@ -1160,7 +1143,7 @@ def data_loader_get_table_metadata(): def data_loader_list_table_metadata(): """Get source metadata for all tables""" try: - with db_manager.connection(session['session_id']) as duck_db_conn: + with db_manager.connection(get_identity_id()) as duck_db_conn: metadata_list = get_all_table_metadata(duck_db_conn) return jsonify({ diff --git a/py-src/data_formulator/workflows/create_vl_plots.py b/py-src/data_formulator/workflows/create_vl_plots.py index 41776fec..74b38006 100644 --- a/py-src/data_formulator/workflows/create_vl_plots.py +++ b/py-src/data_formulator/workflows/create_vl_plots.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from typing import Dict, List, Any, Optional +from typing import Any import vl_convert as vlc import base64 @@ -68,7 +68,7 @@ def detect_field_type(series: pd.Series) -> str: ] -def get_chart_template(chart_type: str) -> Optional[Dict]: +def get_chart_template(chart_type: str) -> dict | None: """ Find a chart template by chart type name. """ @@ -77,7 +77,7 @@ def get_chart_template(chart_type: str) -> Optional[Dict]: return template return None -def create_chart_spec(df: pd.DataFrame, fields: List[str], chart_type: str) -> Dict[str, Dict[str, str]]: +def create_chart_spec(df: pd.DataFrame, fields: list[str], chart_type: str) -> dict[str, dict[str, str]]: """ Assign fields to appropriate visualization channels based on their data types and chart type. """ @@ -85,7 +85,7 @@ def create_chart_spec(df: pd.DataFrame, fields: List[str], chart_type: str) -> D return assemble_vegailte_chart(df, chart_type, encodings) -def fields_to_encodings(df, chart_type: str, fields: List[str]) -> Dict[str, Dict[str, str]]: +def fields_to_encodings(df, chart_type: str, fields: list[str]) -> dict[str, dict[str, str]]: """ Assign fields to appropriate visualization channels based on their data types and chart type. @@ -389,9 +389,9 @@ def assign_faceting_channels(): def assemble_vegailte_chart( df: pd.DataFrame, chart_type: str, - encodings: Dict[str, Dict[str, str]], + encodings: dict[str, dict[str, str]], max_nominal_values: int = 68 -) -> Dict: +) -> dict: """ Assemble a Vega-Lite chart specification from a dataframe, chart type, and encodings. 
@@ -574,7 +574,7 @@ def _get_top_values(df: pd.DataFrame, field_name: str, unique_values: list, return unique_values[:max_values] -def vl_spec_to_png(spec: Dict, output_path: str = None, scale: float = 1.0) -> bytes: +def vl_spec_to_png(spec: dict, output_path: str | None = None, scale: float = 1.0) -> bytes: """ Convert a Vega-Lite specification to a PNG image. @@ -600,7 +600,7 @@ def vl_spec_to_png(spec: Dict, output_path: str = None, scale: float = 1.0) -> b return png_data -def spec_to_base64(spec: Dict, scale: float = 1.0) -> str: +def spec_to_base64(spec: dict, scale: float = 1.0) -> str: """ Convert a Vega-Lite specification to a base64 encoded PNG string. diff --git a/py-src/data_formulator/workflows/exploration_flow.py b/py-src/data_formulator/workflows/exploration_flow.py index dc241a8b..578e06fc 100644 --- a/py-src/data_formulator/workflows/exploration_flow.py +++ b/py-src/data_formulator/workflows/exploration_flow.py @@ -1,11 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import json import logging -from this import d import pandas as pd -from typing import Dict, List, Any, Optional, Tuple, Generator +from typing import Any, Generator from data_formulator.agents.agent_exploration import ExplorationAgent from data_formulator.agents.agent_py_data_rec import PythonDataRecAgent @@ -13,14 +11,13 @@ from data_formulator.agents.client_utils import Client from data_formulator.db_manager import db_manager from data_formulator.workflows.create_vl_plots import assemble_vegailte_chart, spec_to_base64, detect_field_type -from data_formulator.agents.agent_utils import extract_json_objects logger = logging.getLogger(__name__) def create_chart_spec_from_data( - transformed_data: Dict[str, Any], + transformed_data: dict[str, Any], chart_type: str, - chart_encodings: Dict[str, str] + chart_encodings: dict[str, str] ) -> str: """ Create a chart from transformed data using Vega-Lite. @@ -59,17 +56,17 @@ def create_chart_spec_from_data( return None def run_exploration_flow_streaming( - model_config: Dict[str, str], - input_tables: List[Dict[str, Any]], - initial_plan: List[str], + model_config: dict[str, str], + input_tables: list[dict[str, Any]], + initial_plan: list[str], language: str = "python", - session_id: Optional[str] = None, + session_id: str | None = None, exec_python_in_subprocess: bool = False, max_iterations: int = 5, max_repair_attempts: int = 1, agent_exploration_rules: str = "", agent_coding_rules: str = "" -) -> Generator[Dict[str, Any], None, None]: +) -> Generator[dict[str, Any], None, None]: """ Run the complete exploration flow from high-level question to final insights as a streaming generator. 
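The type-hint churn across the loader and workflow files above follows one mechanical pattern: imports of `Dict`, `List`, `Optional`, and `Tuple` from `typing` are dropped in favour of builtin generics (PEP 585) and `X | None` unions (PEP 604), and defaults such as `sort_columns: List[str] = None` become the honestly-optional `sort_columns: list[str] | None = None`. These spellings need Python 3.9/3.10+ at runtime, which lines up with the `requires-python` bump in the `pyproject.toml` hunk that follows. A minimal before/after sketch (illustrative stubs shaped like the loader signatures above, not the real implementations):

```python
from typing import Any

# Before (removed in this diff): generics imported from typing
#   def list_params() -> List[Dict[str, Any]]: ...
#   def ingest_data(self, table_name, name_as: Optional[str] = None,
#                   sort_columns: List[str] = None, sort_order='asc'): ...

# After: builtin generics (PEP 585) and unions (PEP 604), no typing.Dict/List/Optional
def list_params() -> list[dict[str, Any]]:
    """Illustrative stub mirroring the loaders' list_params signatures."""
    return [{"name": "user", "type": "string", "required": True}]

def ingest_data(table_name: str, name_as: str | None = None,
                sort_columns: list[str] | None = None, sort_order: str = "asc") -> None:
    """Illustrative stub; the real methods pull rows into DuckDB."""
    target = name_as if name_as is not None else table_name.split(".")[-1]
    print(f"would ingest {table_name!r} as {target!r}, sorted {sort_order}")
```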
diff --git a/pyproject.toml b/pyproject.toml index 49ed802a..c68e2c75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "data_formulator" version = "0.6" -requires-python = ">=3.9" +requires-python = ">=3.11" authors = [ {name = "Chenglong Wang", email = "chenglong.wang@microsoft.com"}, {name = "Dan Marshall", email = "danmar@microsoft.com"}, @@ -62,3 +62,8 @@ include-package-data = true [project.scripts] data_formulator = "data_formulator:run_app" + +[tool.uv] +dev-dependencies = [ + "build", +] diff --git a/requirements.txt b/requirements.txt index 0fe4db15..aa8d271c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,646 @@ -# Core dependencies (always required) -jupyter -pandas -numpy -flask -flask-cors -flask-limiter -openai -python-dotenv -vega_datasets -litellm -duckdb -vl-convert-python -backoff -beautifulsoup4 -scikit-learn -yfinance # for demo stream routes - -# External data loaders (Azure, BigQuery, AWS S3, MySQL, MSSQL) -azure-identity -azure-kusto-data -azure-keyvault-secrets -azure-storage-blob -google-cloud-bigquery -google-auth -db-dtypes -boto3 -pymysql -pyodbc -pymongo +# This file was autogenerated by uv via the following command: +# uv export --frozen --no-hashes +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.13.3 + # via litellm +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.12.1 + # via + # httpx + # jupyter-server + # openai +appnope==0.1.4 ; sys_platform == 'darwin' + # via ipykernel +argon2-cffi==25.1.0 + # via jupyter-server +argon2-cffi-bindings==25.1.0 + # via argon2-cffi +arrow==1.4.0 + # via isoduration +asttokens==3.0.1 + # via stack-data +async-lru==2.1.0 + # via jupyterlab +attrs==25.4.0 + # via + # aiohttp + # jsonschema + # referencing +azure-core==1.38.0 + # via + # azure-identity + # azure-keyvault-secrets + # azure-kusto-data + # azure-storage-blob +azure-identity==1.25.1 + # via + # azure-kusto-data + # data-formulator +azure-keyvault-secrets==4.10.0 + # via data-formulator +azure-kusto-data==6.0.1 + # via data-formulator +azure-storage-blob==12.28.0 + # via data-formulator +babel==2.17.0 + # via jupyterlab-server +backoff==2.2.1 + # via data-formulator +beautifulsoup4==4.14.3 + # via + # data-formulator + # nbconvert + # yfinance +bleach==6.3.0 + # via nbconvert +blinker==1.9.0 + # via flask +boto3==1.42.39 + # via data-formulator +botocore==1.42.39 + # via + # boto3 + # s3transfer +build==1.4.0 +certifi==2026.1.4 + # via + # curl-cffi + # httpcore + # httpx + # requests +cffi==2.0.0 + # via + # argon2-cffi-bindings + # cryptography + # curl-cffi + # pyzmq +charset-normalizer==3.4.4 + # via requests +click==8.3.1 + # via + # flask + # litellm + # typer-slim +colorama==0.4.6 ; os_name == 'nt' or sys_platform == 'win32' + # via + # build + # click + # ipython + # tqdm +comm==0.2.3 + # via + # ipykernel + # ipywidgets +cryptography==46.0.4 + # via + # azure-identity + # azure-storage-blob + # google-auth + # msal + # pyjwt +curl-cffi==0.13.0 + # via yfinance +db-dtypes==1.5.0 + # via data-formulator +debugpy==1.8.20 + # via ipykernel +decorator==5.2.1 + # via ipython +defusedxml==0.7.1 + # via nbconvert +deprecated==1.3.1 + # via limits +distro==1.9.0 + # via openai +dnspython==2.8.0 + # via pymongo +duckdb==1.4.4 + # via data-formulator +executing==2.2.1 + # via stack-data +fastjsonschema==2.21.2 + # via nbformat +fastuuid==0.14.0 + # via litellm +filelock==3.20.3 + # via huggingface-hub +flask==3.1.2 + # via + # data-formulator + # 
flask-cors + # flask-limiter +flask-cors==6.0.2 + # via data-formulator +flask-limiter==4.1.1 + # via data-formulator +fqdn==1.5.1 + # via jsonschema +frozendict==2.4.7 + # via yfinance +frozenlist==1.8.0 + # via + # aiohttp + # aiosignal +fsspec==2026.1.0 + # via huggingface-hub +google-api-core==2.29.0 + # via + # google-cloud-bigquery + # google-cloud-core +google-auth==2.48.0 + # via + # data-formulator + # google-api-core + # google-cloud-bigquery + # google-cloud-core +google-cloud-bigquery==3.40.0 + # via data-formulator +google-cloud-core==2.5.0 + # via google-cloud-bigquery +google-crc32c==1.8.0 + # via google-resumable-media +google-resumable-media==2.8.0 + # via google-cloud-bigquery +googleapis-common-protos==1.72.0 + # via + # google-api-core + # grpcio-status +grpcio==1.76.0 + # via + # google-api-core + # grpcio-status +grpcio-status==1.76.0 + # via google-api-core +h11==0.16.0 + # via httpcore +hf-xet==1.2.0 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' + # via huggingface-hub +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # huggingface-hub + # jupyterlab + # litellm + # openai +huggingface-hub==1.3.5 + # via tokenizers +idna==3.11 + # via + # anyio + # httpx + # jsonschema + # requests + # yarl +ijson==3.4.0.post0 + # via azure-kusto-data +importlib-metadata==8.7.1 + # via litellm +ipykernel==7.1.0 + # via + # jupyter + # jupyter-console + # jupyterlab +ipython==8.38.0 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipywidgets==8.1.8 + # via jupyter +isodate==0.7.2 + # via + # azure-keyvault-secrets + # azure-storage-blob +isoduration==20.11.0 + # via jsonschema +itsdangerous==2.2.0 + # via flask +jedi==0.19.2 + # via ipython +jinja2==3.1.6 + # via + # flask + # jupyter-server + # jupyterlab + # jupyterlab-server + # litellm + # nbconvert +jiter==0.12.0 + # via openai +jmespath==1.1.0 + # via + # boto3 + # botocore +joblib==1.5.3 + # via scikit-learn +json5==0.13.0 + # via jupyterlab-server +jsonpointer==3.0.0 + # via jsonschema +jsonschema==4.26.0 + # via + # jupyter-events + # jupyterlab-server + # litellm + # nbformat +jsonschema-specifications==2025.9.1 + # via jsonschema +jupyter==1.1.1 + # via data-formulator +jupyter-client==8.8.0 + # via + # ipykernel + # jupyter-console + # jupyter-server + # nbclient +jupyter-console==6.6.3 + # via jupyter +jupyter-core==5.9.1 + # via + # ipykernel + # jupyter-client + # jupyter-console + # jupyter-server + # jupyterlab + # nbclient + # nbconvert + # nbformat +jupyter-events==0.12.0 + # via jupyter-server +jupyter-lsp==2.3.0 + # via jupyterlab +jupyter-server==2.17.0 + # via + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # notebook + # notebook-shim +jupyter-server-terminals==0.5.4 + # via jupyter-server +jupyterlab==4.5.3 + # via + # jupyter + # notebook +jupyterlab-pygments==0.3.0 + # via nbconvert +jupyterlab-server==2.28.0 + # via + # jupyterlab + # notebook +jupyterlab-widgets==3.0.16 + # via ipywidgets +lark==1.3.1 + # via rfc3987-syntax +limits==5.6.0 + # via flask-limiter +litellm==1.81.5 + # via data-formulator +markupsafe==3.0.3 + # via + # flask + # jinja2 + # nbconvert + # werkzeug +matplotlib-inline==0.2.1 + # via + # ipykernel + # ipython +mistune==3.2.0 + # via nbconvert +msal==1.34.0 + # via + # azure-identity + # azure-kusto-data + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +multidict==6.7.1 + # via + # aiohttp + # yarl +multitasking==0.0.12 + # via 
yfinance +nbclient==0.10.4 + # via nbconvert +nbconvert==7.17.0 + # via + # jupyter + # jupyter-server +nbformat==5.10.4 + # via + # jupyter-server + # nbclient + # nbconvert +nest-asyncio==1.6.0 + # via ipykernel +notebook==7.5.3 + # via jupyter +notebook-shim==0.2.4 + # via + # jupyterlab + # notebook +numpy==2.2.6 + # via + # data-formulator + # db-dtypes + # pandas + # scikit-learn + # scipy + # yfinance +openai==2.16.0 + # via + # data-formulator + # litellm +ordered-set==4.1.0 + # via flask-limiter +overrides==7.7.0 ; python_full_version < '3.12' + # via jupyter-server +packaging==26.0 + # via + # build + # db-dtypes + # google-cloud-bigquery + # huggingface-hub + # ipykernel + # jupyter-events + # jupyter-server + # jupyterlab + # jupyterlab-server + # limits + # nbconvert +pandas==2.3.3 + # via + # data-formulator + # db-dtypes + # vega-datasets + # yfinance +pandocfilters==1.5.1 + # via nbconvert +parso==0.8.5 + # via jedi +peewee==3.19.0 + # via yfinance +pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' + # via ipython +platformdirs==4.5.1 + # via + # jupyter-core + # yfinance +prometheus-client==0.24.1 + # via jupyter-server +prompt-toolkit==3.0.52 + # via + # ipython + # jupyter-console +propcache==0.4.1 + # via + # aiohttp + # yarl +proto-plus==1.27.0 + # via google-api-core +protobuf==6.33.5 + # via + # google-api-core + # googleapis-common-protos + # grpcio-status + # proto-plus + # yfinance +psutil==7.2.2 + # via ipykernel +ptyprocess==0.7.0 ; os_name != 'nt' or (sys_platform != 'emscripten' and sys_platform != 'win32') + # via + # pexpect + # terminado +pure-eval==0.2.3 + # via stack-data +pyarrow==23.0.0 + # via db-dtypes +pyasn1==0.6.2 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pycparser==3.0 ; implementation_name != 'PyPy' + # via cffi +pydantic==2.12.5 + # via + # litellm + # openai +pydantic-core==2.41.5 + # via pydantic +pygments==2.19.2 + # via + # ipython + # jupyter-console + # nbconvert +pyjwt==2.11.0 + # via msal +pymongo==4.16.0 + # via data-formulator +pymysql==1.1.2 + # via data-formulator +pyodbc==5.3.0 + # via data-formulator +pyproject-hooks==1.2.0 + # via build +python-dateutil==2.9.0.post0 + # via + # arrow + # azure-kusto-data + # botocore + # google-cloud-bigquery + # jupyter-client + # pandas +python-dotenv==1.2.1 + # via + # data-formulator + # litellm +python-json-logger==4.0.0 + # via jupyter-events +pytz==2025.2 + # via + # pandas + # yfinance +pywinpty==3.0.2 ; os_name == 'nt' + # via + # jupyter-server + # jupyter-server-terminals + # terminado +pyyaml==6.0.3 + # via + # huggingface-hub + # jupyter-events +pyzmq==27.1.0 + # via + # ipykernel + # jupyter-client + # jupyter-console + # jupyter-server +referencing==0.37.0 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +regex==2026.1.15 + # via tiktoken +requests==2.32.5 + # via + # azure-core + # azure-kusto-data + # google-api-core + # google-cloud-bigquery + # jupyterlab-server + # msal + # tiktoken + # yfinance +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rfc3987-syntax==1.1.0 + # via jsonschema +rpds-py==0.30.0 + # via + # jsonschema + # referencing +rsa==4.9.1 + # via google-auth +s3transfer==0.16.0 + # via boto3 +scikit-learn==1.7.2 + # via data-formulator +scipy==1.15.3 + # via scikit-learn +send2trash==2.1.0 + # via jupyter-server +setuptools==80.10.2 + # via jupyterlab +shellingham==1.5.4 + # via huggingface-hub 
+six==1.17.0 + # via + # python-dateutil + # rfc3339-validator +sniffio==1.3.1 + # via openai +soupsieve==2.8.3 + # via beautifulsoup4 +stack-data==0.6.3 + # via ipython +terminado==0.18.1 + # via + # jupyter-server + # jupyter-server-terminals +threadpoolctl==3.6.0 + # via scikit-learn +tiktoken==0.12.0 + # via litellm +tinycss2==1.4.0 + # via bleach +tokenizers==0.22.2 + # via litellm +tornado==6.5.4 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # notebook + # terminado +tqdm==4.67.2 + # via + # huggingface-hub + # openai +traitlets==5.14.3 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-console + # jupyter-core + # jupyter-events + # jupyter-server + # jupyterlab + # matplotlib-inline + # nbclient + # nbconvert + # nbformat +typer-slim==0.21.1 + # via huggingface-hub +typing-extensions==4.15.0 + # via + # aiosignal + # anyio + # azure-core + # azure-identity + # azure-keyvault-secrets + # azure-storage-blob + # beautifulsoup4 + # flask-limiter + # grpcio + # huggingface-hub + # ipython + # limits + # openai + # pydantic + # pydantic-core + # referencing + # typer-slim + # typing-inspection +typing-inspection==0.4.2 + # via pydantic +tzdata==2025.3 + # via + # arrow + # pandas +uri-template==1.3.0 + # via jsonschema +urllib3==2.6.3 + # via + # botocore + # requests +vega-datasets==0.9.0 + # via data-formulator +vl-convert-python==1.9.0.post1 + # via data-formulator +wcwidth==0.5.2 + # via prompt-toolkit +webcolors==25.10.0 + # via jsonschema +webencodings==0.5.1 + # via + # bleach + # tinycss2 +websocket-client==1.9.0 + # via jupyter-server +websockets==16.0 + # via yfinance +werkzeug==3.1.5 + # via + # flask + # flask-cors +widgetsnbextension==4.0.15 + # via ipywidgets +wrapt==2.0.1 + # via deprecated +yarl==1.22.0 + # via aiohttp +yfinance==1.1.0 + # via data-formulator +zipp==3.23.0 + # via importlib-metadata # Install data_formulator itself in editable mode --e . \ No newline at end of file +-e . 
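The regenerated `requirements.txt` above is lockfile output rather than a hand-maintained list; its header records the command that produced it (`uv export --frozen --no-hashes`). A small sketch of reproducing it from the repo root, assuming `uv` is on `PATH` — the wrapper function and output filename here are illustrative, not part of the repo:

```python
# Re-export the pinned requirements from uv's lockfile, as recorded in the
# header comment of the generated file above. uv export prints the
# requirements.txt-format output to stdout, so redirect it into the file.
import subprocess

def export_requirements(path: str = "requirements.txt") -> None:
    with open(path, "w") as fh:
        subprocess.run(
            ["uv", "export", "--frozen", "--no-hashes"],
            stdout=fh,
            check=True,
        )

if __name__ == "__main__":
    export_requirements()
```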
diff --git a/src/app/App.tsx b/src/app/App.tsx index 209f3ac5..e4d09a4c 100644 --- a/src/app/App.tsx +++ b/src/app/App.tsx @@ -10,8 +10,8 @@ import { dfActions, dfSelectors, fetchAvailableModels, - getSessionId, } from './dfSlice' +import { getBrowserId } from './identity'; import { red, purple, blue, brown, yellow, orange, } from '@mui/material/colors'; @@ -68,7 +68,7 @@ import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown'; import UploadFileIcon from '@mui/icons-material/UploadFile'; import DownloadIcon from '@mui/icons-material/Download'; import { handleDBDownload } from '../views/DBTableManager'; -import { getUrls } from './utils'; +import { getUrls, fetchWithIdentity } from './utils'; import { UnifiedDataUploadDialog } from '../views/UnifiedDataUploadDialog'; import ChatIcon from '@mui/icons-material/Chat'; import { AgentRulesDialog } from '../views/AgentRulesDialog'; @@ -159,7 +159,7 @@ export const ImportStateButton: React.FC<{}> = ({ }) => { } export const ExportStateButton: React.FC<{}> = ({ }) => { - const sessionId = useSelector((state: DataFormulatorState) => state.sessionId); + const identity = useSelector((state: DataFormulatorState) => state.identity); const tables = useSelector((state: DataFormulatorState) => state.tables); const fullStateJson = useSelector((state: DataFormulatorState) => { // Fields to exclude from serialization @@ -168,7 +168,7 @@ export const ExportStateButton: React.FC<{}> = ({ }) => { 'selectedModelId', 'testedModels', 'dataLoaderConnectParams', - 'sessionId', + 'identity', 'agentRules', 'serverConfig', ]); @@ -197,7 +197,7 @@ export const ExportStateButton: React.FC<{}> = ({ }) => { a.click(); } let firstTableName = tables.length > 0 ? tables[0].id: ''; - download(fullStateJson, `df_state_${firstTableName}_${sessionId?.slice(0, 4)}.json`, 'text/plain'); + download(fullStateJson, `df_state_${firstTableName}_${identity.id.slice(0, 4)}.json`, 'text/plain'); }} startIcon={} > @@ -241,7 +241,7 @@ const TableMenu: React.FC = () => { const SessionMenu: React.FC = () => { const [anchorEl, setAnchorEl] = useState(null); const open = Boolean(anchorEl); - const sessionId = useSelector((state: DataFormulatorState) => state.sessionId); + const identity = useSelector((state: DataFormulatorState) => state.identity); const tables = useSelector((state: DataFormulatorState) => state.tables); const theme = useTheme(); @@ -274,12 +274,12 @@ const SessionMenu: React.FC = () => { database file - {sessionId && tables.some(t => t.virtual) && + {tables.some(t => t.virtual) && This session contains data stored in the database, export and reload the database to resume the session later. } - t.virtual)} onClick={() => { - handleDBDownload(sessionId ?? ''); + t.virtual)} onClick={() => { + handleDBDownload(identity.id); }}> {}}> -