From 95ca9a49cee52e5edfdcb64f9e3898f7cf689795 Mon Sep 17 00:00:00 2001 From: sam Date: Tue, 9 Dec 2025 18:12:37 +0000 Subject: [PATCH 1/4] initial refactor --- api-reference/evals/init_eval.mdx | 10 +- api-reference/evals/save_eval_datapoints.mdx | 12 +- api-reference/evals/update_eval_datapoint.mdx | 10 +- api-reference/evaluators/score.mdx | 8 + api-reference/introduction.mdx | 21 +- api-reference/queues/push.mdx | 8 + api-reference/sql/sql_query.mdx | 14 +- changelog/index.mdx | 95 +++++ custom-dashboards/overview.mdx | 94 +++-- datasets/introduction.mdx | 122 ++----- docs.json | 149 ++++---- evaluations/configuration.mdx | 2 +- evaluations/introduction.mdx | 63 ++-- evaluations/online-evaluations.mdx | 2 +- evaluations/quickstart.mdx | 337 ++++-------------- guides/evaluating-tool-calls.mdx | 10 +- guides/nextjs-aisdk.mdx | 2 +- guides/nextjs.mdx | 2 +- installation.mdx | 127 ++----- overview.mdx | 94 +++-- playground/introduction.mdx | 43 ++- projects/introduction.mdx | 2 +- queues/quickstart.mdx | 81 ++--- snippets/Trace.mdx | 58 +++ sql-editor/introduction.mdx | 111 ++---- sql-editor/overview.mdx | 250 ++----------- sql-editor/reference.mdx | 2 +- style.css | 125 +++++++ tracing/integrations/anthropic.mdx | 182 ++++------ tracing/integrations/nextjs.mdx | 165 +++------ tracing/integrations/openai.mdx | 181 ++++------ tracing/integrations/vercel-ai-sdk.mdx | 64 ++-- tracing/introduction.mdx | 116 ------ tracing/otel.mdx | 8 +- tracing/quickstart.mdx | 163 ++++++--- tracing/share-spans.mdx | 38 ++ tracing/structure/manual-span-creation.mdx | 4 +- tracing/structure/overview.mdx | 95 ----- tracing/structure/providers.mdx | 4 +- tracing/troubleshooting-opentelemetry.mdx | 4 +- tracing/troubleshooting.mdx | 2 +- 41 files changed, 1191 insertions(+), 1689 deletions(-) create mode 100644 api-reference/evaluators/score.mdx create mode 100644 api-reference/queues/push.mdx create mode 100644 changelog/index.mdx create mode 100644 snippets/Trace.mdx create mode 100644 style.css delete mode 100644 tracing/introduction.mdx create mode 100644 tracing/share-spans.mdx delete mode 100644 tracing/structure/overview.mdx diff --git a/api-reference/evals/init_eval.mdx b/api-reference/evals/init_eval.mdx index 6c31996..090fb87 100644 --- a/api-reference/evals/init_eval.mdx +++ b/api-reference/evals/init_eval.mdx @@ -1,8 +1,8 @@ --- -title: 'Initialize Evaluation' -openapi: 'POST /v1/evals' +title: "Trigger or update an evaluation" +method: POST +path: /v1/evals +description: Create or trigger an evaluation run. --- -### Description - -Create a new evaluation with an optional name and group. If no name is provided, a random name will be generated automatically. \ No newline at end of file +Create or trigger an evaluation run. If no name is provided, Laminar generates one automatically. diff --git a/api-reference/evals/save_eval_datapoints.mdx b/api-reference/evals/save_eval_datapoints.mdx index f2af70d..40960c9 100644 --- a/api-reference/evals/save_eval_datapoints.mdx +++ b/api-reference/evals/save_eval_datapoints.mdx @@ -1,12 +1,12 @@ --- -title: 'Save Evaluation Datapoints' -openapi: 'POST /v1/evals/{eval_id}/datapoints' +title: "Add evaluation datapoints" +method: POST +path: /v1/evals/{eval_id}/datapoints +description: Add datapoints to an existing evaluation. --- -### Description - -Save multiple evaluation datapoints to an existing evaluation. Each datapoint can include input, output, expected output, executor output, scores, and metadata. 
+Add multiple evaluation datapoints to an existing evaluation. Each datapoint can include input, output, expected output, executor output, scores, and metadata. The actual datapoints are not saved until you call the [Update Evaluation Datapoint](/api-reference/evals/update_eval_datapoint) endpoint. - \ No newline at end of file + diff --git a/api-reference/evals/update_eval_datapoint.mdx b/api-reference/evals/update_eval_datapoint.mdx index d8952e9..2bf0f71 100644 --- a/api-reference/evals/update_eval_datapoint.mdx +++ b/api-reference/evals/update_eval_datapoint.mdx @@ -1,8 +1,8 @@ --- -title: 'Update Evaluation Datapoint' -openapi: 'POST /v1/evals/{eval_id}/datapoints/{datapoint_id}' +title: "Update evaluation datapoint" +method: POST +path: /v1/evals/{eval_id}/datapoints/{datapoint_id} +description: Update a specific datapoint with new output or scores. --- -### Description - -Update a specific evaluation datapoint with new executor output and scores. \ No newline at end of file +Update a specific evaluation datapoint with new executor output, scores, or metadata. diff --git a/api-reference/evaluators/score.mdx b/api-reference/evaluators/score.mdx new file mode 100644 index 0000000..bfd5347 --- /dev/null +++ b/api-reference/evaluators/score.mdx @@ -0,0 +1,8 @@ +--- +title: "Score via evaluator" +method: POST +path: /v1/evaluators/score +description: Score outputs using a configured evaluator. +--- + +Use a configured evaluator to score outputs (e.g., LLM-as-a-judge or code evaluators). diff --git a/api-reference/introduction.mdx b/api-reference/introduction.mdx index 4de49f0..43fadd1 100644 --- a/api-reference/introduction.mdx +++ b/api-reference/introduction.mdx @@ -1,15 +1,24 @@ --- -title: 'Overview' -description: 'General guidelines on using our API' +title: "API reference" +description: "Endpoints to trigger evals, push to queues, and query data." --- -## General +## Base URL Use the following base URL: `https://api.lmnr.ai/v1` -For example, `POST https://api.lmnr.ai/v1/sql/query` +Example: `POST https://api.lmnr.ai/v1/sql/query` -For more detailed information about each endpoint or schema, check our OpenAPI specification. +## Endpoints + +- `POST /v1/evals` — trigger or update evals (create/trigger an eval run) +- `POST /v1/evals/{eval_id}/datapoints` — add eval datapoints +- `POST /v1/evals/{eval_id}/datapoints/{datapoint_id}` — update a specific datapoint +- `POST /v1/evaluators/score` — score via evaluator +- `POST /v1/queues/push` — push items to a labeling queue +- `POST /v1/sql/query` — run a SQL query + +For schemas and parameters, see the OpenAPI spec. -Each endpoint's page in OpenAPI specification specifies the method, path and parameters to be used. Additionally, you can try sending the request from there. - ## Authentication All API endpoints are authenticated using Project API key as Bearer token. diff --git a/api-reference/queues/push.mdx b/api-reference/queues/push.mdx new file mode 100644 index 0000000..94303e3 --- /dev/null +++ b/api-reference/queues/push.mdx @@ -0,0 +1,8 @@ +--- +title: "Push to labeling queue" +method: POST +path: /v1/queues/push +description: Push items into a labeling queue. +--- + +Add items to a labeling queue. Useful for turning traces or datasets into labeled examples. 
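+
+A minimal sketch of calling this endpoint with `fetch` is shown below. The request body fields are illustrative assumptions, not the confirmed schema; check the OpenAPI spec for the exact parameters.
+
+```ts
+// Illustrative sketch only: the body fields below are assumptions, not the
+// confirmed schema. Authenticate with your Project API key as a Bearer token.
+async function pushToQueue() {
+  const response = await fetch("https://api.lmnr.ai/v1/queues/push", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${process.env.LMNR_PROJECT_API_KEY}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      // Hypothetical payload mirroring dataset datapoints (data/target/metadata).
+      items: [{ data: { question: "..." }, target: {}, metadata: {} }],
+    }),
+  });
+  return response.json();
+}
+```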
diff --git a/api-reference/sql/sql_query.mdx b/api-reference/sql/sql_query.mdx index a7a5af7..131a2d0 100644 --- a/api-reference/sql/sql_query.mdx +++ b/api-reference/sql/sql_query.mdx @@ -1,17 +1,17 @@ --- -title: SQL Query -sidebarTitle: SQL Query -openapi: /api-reference/openapi.json POST /v1/sql/query +title: "Run SQL query" +sidebarTitle: "Run SQL query" +method: POST +path: /v1/sql/query +description: Execute a SQL query against Laminar data. --- -## SQL Query - -You can run SQL queries on your data stored in Laminar using the SQL query API. Learn more in the [SQL Editor](/sql-editor/introduction) reference. +Execute SQL against Laminar data (spans, traces, evals). Learn more in the [SQL Editor](/sql-editor/introduction) reference. ### Example request ```json { - "query": "SELECT * FROM spans where start_time > now() - interval '1 hour' LIMIT 10" + "query": "SELECT * FROM spans WHERE start_time > now() - interval '1 hour' LIMIT 10" } ``` diff --git a/changelog/index.mdx b/changelog/index.mdx new file mode 100644 index 0000000..e19e6f5 --- /dev/null +++ b/changelog/index.mdx @@ -0,0 +1,95 @@ +--- +title: "Changelog" +description: "Key releases and updates" +--- + +
+## Dec 3, 2025 - Claude Agent SDK instrumentation
+
+Automatic tracing for the Claude Agent SDK with a lightweight Rust proxy.
+
+- Captures agent actions, inputs/outputs, tokens, and cost
+- Works with existing Laminar tracing
+
+## Nov 2025 - SDK updates (v0.7.x)
+
+Reliability fixes and better ESM support.
+
+- Python SDK v0.7.22
+- TypeScript SDK v0.7.6
+
+## Oct 2025 - Index browser agent API
+
+Serverless API for running browser agents in production with full observability.
+
+- Supports Gemini, Claude, and OpenAI models
+- Structured outputs with Pydantic schemas
+- Interactive CLI for testing
+
+## Sep 2025 - LangGraph visualization & Skyvern integration
+
+Visualize LangGraph execution flows and trace Skyvern automation with session recordings.
+
+## Aug 2025 - Custom dashboards & SQL editor
+
+Query all Laminar data with SQL and build dashboards. Backed by ClickHouse for sub-second reads.
+
+## Jul 2025 - Browser agent observability suite
+
+Session recordings synced with traces; real-time spans and live cost tracking.
+
+- Supports Playwright, Puppeteer, Stagehand, and Browser Use
+- 30+ minute sessions with instant playback
+
+## Jun 2025 - Vercel AI SDK integration
+
+Automatic tracing for `generateText`/`streamText` with Next.js support.
+
+## Jun 2025 - Stagehand integration
+
+Trace Stagehand runs with session recordings and per-step cost.
+
+## May 2025 - Integration expansion
+
+Support for Gemini, Mistral, Bedrock, Groq, Cohere, CrewAI, and LiteLLM, plus improved OpenAI and Anthropic instrumentation.
+
+## Apr 2025 - Agent manager service
+
+Self-hosted container to manage browser agent infrastructure.
+
+## Dec 2024 - Launch Week #1
+
+Flow (dynamic task engine), Evaluations SDK, Semantic Search API, Labeling Queues, and Online Evaluations.
+
+## Summer 2024 - Core platform launch
+
+Automatic tracing (OpenAI, Anthropic, LangChain), the `@observe` decorator, datasets, playground, and self-hosting.
diff --git a/custom-dashboards/overview.mdx b/custom-dashboards/overview.mdx index 0531a0e..02212f2 100644 --- a/custom-dashboards/overview.mdx +++ b/custom-dashboards/overview.mdx @@ -1,67 +1,59 @@ --- -title: "Creating dashboards to track metrics of AI agents" -sidebarTitle: "Overview" +title: Create dashboards to track your AI app +sidebarTitle: Overview --- - - Laminar Custom Dashboard Introduction - +Build dashboards on top of your traces, spans, events, and eval scores without extra infrastructure. Track costs, latency, accuracy, or any custom metric in a few clicks. -## What You Can Track + +Embed placeholder: GIF of creating a chart from a query and pinning it to a dashboard (tokens/cost by route). + -Custom Dashboards are built on top of our powerful query engine, working across all your observability data - `traces`, `spans`, `events`, and `tags`. You can track any metric that matters to your application. +## What you'll build -For detailed information on available entities, fields, and how to select the right data for your charts, see the [SQL Editor Reference](/sql-editor/reference). -## How to Build Charts +- A dashboard tile showing the metrics that matter (cost, tokens, latency, accuracy). +- Filters and group-bys to break down by model, route, user, team, or tag. +- Shareable dashboards for eng/support/research. -To create a chart, navigate to **dashboard** menu and click the **`+ Chart`** button in the upper right corner. +## Copy/paste workflow - - Chart Builder flow - +1. Open **Dashboard** → click **`+ Chart`**. +2. Choose a visualization (line, bar, horizontal bar for rankings). +3. Select a source (`spans`, `traces`, `events`, `evaluation_scores`). +4. Define metrics (count/sum/avg/p90/p95/p99) and group by model/route/tag. +5. Save to a dashboard and resize/arrange tiles. -**The process:** + +Need a custom query? Use the SQL editor to craft it, then add the result as a chart. + -1. **Pick visualization**: - - **Line Chart**: For time series visualization. We automatically prefill and group data by time range, perfect for tracking trends over time. - - **Bar Chart**: Another alternative to visualize time series data, useful when you want to emphasize individual time periods. - - **Horizontal Bar Chart**: For visualizations that need to be ranked, similar to a sortable table. Use this to compare and rank items (e.g., top users, models by cost). +## Example recipes -2. **Select data source**: Traces, Spans, Events, and Tags +**Total tokens by model (identify spend hotspots)** +- Source: `spans` +- Metric: `sum(total_tokens)` +- Group by: `model` +- Filter: `span_type = 'LLM'` +- Visualization: Line chart -3. **Define metrics**: What to measure (count, sum, avg, min, max, p90, p95, p99) +**p90 cost by provider (compare pricing drift)** +- Source: `spans` +- Metric: `p90(cost)` +- Group by: `provider` +- Visualization: Line chart -4. **Add context**: Filters to narrow scope, grouping to break down by dimensions, order by fields, limits for top N results +**Top slow routes (find regressions)** +- Source: `traces` +- Metric: `p95(duration_ms)` +- Group by: `route` +- Visualization: Horizontal bar -5. **Save and customize**: Add to dashboard, resize as needed + +Embed placeholder: dashboard screenshot with the three example charts; add “View live trace” buttons on hover. + -## Examples +## Build this next -### Total Tokens by Model - -See which models consume the most tokens to identify where your LLM costs are going. 
Use this to decide if you're using the right model for each use case. - - - Total tokens by model chart - - -**How to build:** -- Chart type: Line Chart -- Table: `spans` -- Metric: `total_tokens` with `sum` aggregation -- Group by: `model` -- Filter: `span_type` = `LLM` (to include only LLM calls) - -### p90 Cost by Provider - -Track cost trends across different LLM providers over time. The p90 metric shows what most of your expensive requests cost, helping you compare provider pricing and spot cost increases. - - - p90 cost by provider chart - - -**How to build:** -- Chart type: Line Chart -- Table: `spans` -- Metric: `cost` with `p90` aggregation -- Group by: `provider` +- Write custom queries → [SQL editor](/sql-editor/introduction) +- Export query results to datasets → [Export & datasets](/sql-editor/overview) +- Pipe eval scores into dashboards → [Evaluations](/evaluations/introduction) diff --git a/datasets/introduction.mdx b/datasets/introduction.mdx index 026a135..4bed4f5 100644 --- a/datasets/introduction.mdx +++ b/datasets/introduction.mdx @@ -1,112 +1,52 @@ --- sidebarTitle: Introduction -title: Introduction to Laminar datasets +title: Datasets for evals and training --- -## Concept +Datasets in Laminar hold the examples that power your evals, labeling, and training loops. Each datapoint is JSON you can version, filter, and connect back to traces. -Dataset is a collection of datapoints. It can be used for the following purposes: -1. Data storage for use in future fine-tuning or prompt-tuning. -1. Provide inputs and expected outputs for [Evaluations](/evaluations/introduction). + +Embed placeholder: screenshot of a dataset table with data/target/metadata columns and a “View trace” link. + -## Format +## What you'll do with datasets -Every datapoint has two fixed JSON objects: `data` and `target`, each with arbitrary keys. -`target` is only used in evaluations. +- Store inputs + expected outputs for evaluations. +- Turn traces into labeled examples, then iterate in labeling queues. +- Export query results (cost outliers, bad traces) straight into a dataset. -- `data` – the actual datapoint data, -- `target` – data additionally sent to the evaluator function. -- `metadata` – arbitrary key-value metadata about the datapoint. - -For every key inside `data` and `target`, the value can be any JSON value. - -### Example - -This is an example of a valid datapoint. +## Datapoint shape ```json { - "data": { - "color": "red", - "size": "large", - "messages": [ - { - "role": "user", - "content": "Hello, can you help me choose a T-shirt?" - }, - { - "role": "assistant", - "content": "I'm afraid, we don't sell T-shirts" - } - ] - }, - "target": { - "expected_output": "Of course! What size and color are you looking for?" - } + "data": { "question": "What is the capital of France?" }, + "target": { "answer": "Paris" }, + "metadata": { "category": "geography" } } ``` -## Use case: Evaluations - -Datasets can be used for evaluations to specify inputs and expected outputs. - -You will need to make sure the dataset keys match the input and output node names of the pipelines. -See more in the [Evaluations](/evaluations/introduction) page. - -## Editing - -Datasets are editable. You can edit the datapoints by clicking on the datapoint and -editing the data in JSON. Changes are saved as a new datapoint version. +- `data`: the input to your executor. +- `target`: optional reference passed to evaluators. +- `metadata`: tags for filtering and grouping. 
-### Versioning +## Versioning (built-in) -Each datapoint has a unique id and a `created_at` timestamp. Every time you -edit a datapoint, under the hood, -a new datapoint version is created with the same id, -but the `created_at` timestamp is updated. +- Every edit creates a new version with the same id and a new timestamp. +- Reverting creates a new top version; history stays intact. +- Sort with UUIDv7 to preserve insertion order. -The version stack is push-only. That is, when you revert to a previous version, -a copy of that version is created and added as a current version. +## Common workflows -Example: +- **Feed evals**: wire a dataset into [evaluate](/evaluations/quickstart) to score prompts/agents. +- **Label from traces**: push spans into a queue, label targets, and write back to the dataset. +- **Export from SQL**: query outliers in the [SQL editor](/sql-editor/introduction) and export to a dataset. -- Initial version (v1): -```json -{ - "id": "019a3122-ca78-7d75-91a7-a860526895b2", - "created_at": "2025-01-01T00:00:00.000Z", - "data": { "key": "initial value" } -} -``` -- Version 2 (v2): -```json -{ - "id": "019a3122-ca78-7d75-91a7-a860526895b2", - "created_at": "2025-01-05T00:00:05.000Z", - "data": { "key": "value at v2" } -} -``` -- Version 3 (v3): -```json -{ - "id": "019a3122-ca78-7d75-91a7-a860526895b2", - "created_at": "2025-01-10T00:00:10.000Z", - "data": { "key": "value at v3" } -} -``` - -After this, you want to update to version 1 (initial version). This will create a new version (v4) with the same id, but the `created_at` timestamp is updated. - -- Version 4 (v4): -```json -{ - "id": "019a3122-ca78-7d75-91a7-a860526895b2", - "created_at": "2025-01-15T00:00:15.000Z", - "data": { "key": "initial value" } -} -``` + +Embed placeholder: GIF of selecting traces → exporting to dataset → running an eval. + -### Datapoint id +## Build this next -When you push a new datapoint to a dataset, a UUIDv7 is generated for it. -This allows to sort datapoints by their creation order and preserve the order of insertion. 
+- Create/load datasets programmatically → [Datasets CLI](/datasets/cli) +- Label quickly → [Labeling queues](/queues/quickstart) +- Run evals on your dataset → [Evaluations quickstart](/evaluations/quickstart) diff --git a/docs.json b/docs.json index 0c35654..db30797 100644 --- a/docs.json +++ b/docs.json @@ -1,6 +1,6 @@ { "$schema": "https://mintlify.com/docs.json", - "theme": "mint", + "theme": "maple", "name": "Laminar documentation", "colors": { "primary": "#ED6E40", @@ -12,9 +12,11 @@ "tabs": [ { "tab": "Documentation", + "icon": "book-open", "groups": [ { - "group": "Overview", + "group": "Getting Started", + "icon": "rocket", "pages": [ "overview", "installation", @@ -25,39 +27,49 @@ "self-hosting/access-control-setup" ] }, + "tracing/quickstart", "cursor" ] }, { - "group": "Tracing", + "group": "Model providers", + "pages": [ + "tracing/integrations/openai", + "tracing/integrations/anthropic", + "tracing/integrations/gemini", + "tracing/integrations/cohere", + "tracing/integrations/litellm" + ] + }, + { + "group": "Frameworks & SDKs", + "pages": [ + "tracing/integrations/vercel-ai-sdk", + "tracing/integrations/nextjs", + "tracing/integrations/langchain", + "tracing/integrations/kernel", + "tracing/integrations/claude-agent-sdk" + ] + }, + { + "group": "Browser & agents", + "pages": [ + "tracing/integrations/browser-use", + "tracing/integrations/stagehand", + "tracing/integrations/skyvern", + "tracing/integrations/playwright", + "tracing/integrations/puppeteer" + ] + }, + { + "group": "Debug & Optimize", + "icon": "bug", "pages": [ - "tracing/introduction", - "tracing/quickstart", "tracing/automatic-instrumentation", - { - "group": "Integrations", - "pages": [ - "tracing/integrations/openai", - "tracing/integrations/anthropic", - "tracing/integrations/gemini", - "tracing/integrations/langchain", - "tracing/integrations/cohere", - "tracing/integrations/vercel-ai-sdk", - "tracing/integrations/nextjs", - "tracing/integrations/litellm", - "tracing/integrations/kernel", - "tracing/integrations/claude-agent-sdk", - "tracing/integrations/browser-use", - "tracing/integrations/stagehand", - "tracing/integrations/skyvern", - "tracing/integrations/playwright", - "tracing/integrations/puppeteer" - ] - }, { "group": "Tracing Structure", + "icon": "hierarchy", "pages": [ - "tracing/structure/overview", "tracing/structure/observe", "tracing/structure/manual-span-creation", "tracing/structure/session", @@ -74,6 +86,7 @@ "tracing/browser-agent-observability", "tracing/langgraph-visualization", "tracing/events", + "tracing/share-spans", "tracing/otel", { "group": "Troubleshooting", @@ -85,7 +98,8 @@ ] }, { - "group": "Evaluations", + "group": "Evaluate", + "icon": "chart-line", "pages": [ "evaluations/introduction", "evaluations/quickstart", @@ -106,36 +120,23 @@ ] }, { - "group": "SQL Editor", - "pages": [ - "sql-editor/introduction", - "sql-editor/overview", - "sql-editor/reference" - ] - }, - { - "group": "Custom Dashboards", - "pages": [ - "custom-dashboards/overview" - ] - }, - { - "group": "Datasets", + "group": "Datasets & Labeling", + "icon": "database", "pages": [ "datasets/introduction", "datasets/adding-data", - "datasets/cli" - ] - }, - { - "group": "Labeling Queues", - "pages": [ + "datasets/cli", "queues/quickstart" ] }, { - "group": "Playground", + "group": "Analyze & Visualize", + "icon": "chart-pie", "pages": [ + "sql-editor/introduction", + "sql-editor/overview", + "sql-editor/reference", + "custom-dashboards/overview", "playground/introduction", "playground/playground-from-span", 
"playground/tools", @@ -145,37 +146,33 @@ ] }, { - "tab": "Guides", + "tab": "Recipes", + "icon": "map", "pages": [ "guides/fastapi", - "guides/nextjs-aisdk", "guides/nextjs", + "guides/nextjs-aisdk", "guides/evaluating-tool-calls" ] }, + { + "tab": "Changelog", + "icon": "megaphone", + "pages": [ + "changelog/index" + ] + }, { "tab": "API Reference", - "groups": [ - { - "group": "API Documentation", - "pages": [ - "api-reference/introduction", - { - "group": "Evaluations", - "pages": [ - "api-reference/evals/init_eval", - "api-reference/evals/save_eval_datapoints", - "api-reference/evals/update_eval_datapoint" - ] - }, - { - "group": "SQL", - "pages": [ - "api-reference/sql/sql_query" - ] - } - ] - } + "icon": "code", + "pages": [ + "api-reference/introduction", + "api-reference/evals/init_eval", + "api-reference/evals/save_eval_datapoints", + "api-reference/evals/update_eval_datapoint", + "api-reference/evaluators/score", + "api-reference/queues/push", + "api-reference/sql/sql_query" ] } ], @@ -194,6 +191,12 @@ } }, "logo": "/logo/logo.png", + "styling": { + "breadcrumbs": true, + "css": [ + "/style.css" + ] + }, "api": { "openapi": [ "api-reference/openapi.json" @@ -238,4 +241,4 @@ "apiHost": "https://p.lmnr.ai" } } -} \ No newline at end of file +} diff --git a/evaluations/configuration.mdx b/evaluations/configuration.mdx index 0778ab1..b992b7e 100644 --- a/evaluations/configuration.mdx +++ b/evaluations/configuration.mdx @@ -1,7 +1,7 @@ --- sidebarTitle: Configuration title: Configuring Laminar evaluations -description: This page describes how to configure evaluations in Laminar and showcases some common use cases. +description: Configure evaluations in Laminar and see common use cases. --- ## Configuring evaluations to report results to locally self-hosted Laminar diff --git a/evaluations/introduction.mdx b/evaluations/introduction.mdx index c76666f..82a26c3 100644 --- a/evaluations/introduction.mdx +++ b/evaluations/introduction.mdx @@ -1,48 +1,47 @@ --- sidebarTitle: Introduction -title: Introduction to Laminar evaluations +title: Ship reliable AI with Laminar evaluations --- -import GetProjectApiKey from '/snippets/get-project-api-key.mdx'; -Evaluation is the process of validating and testing the outputs that your AI applications are producing. Having strong evaluations ("evals") means a more stable, reliable application that is resilient to code and model changes. An eval is a task used to measure the quality of the output of an LLM or LLM system. +Evals in Laminar give you a fast way to score prompts, agents, and pipelines, then drill into every datapoint via traces. See what you can do and why it matters before diving into reference details. -
-Screenshot of a trace visualization -
+ +Hero placeholder: embed a dashboard screenshot/GIF showing eval scores with clickable trace links. + -### Why do we need evals? +## What you'll build -In short, evaluations bring rigor to AI development process. +- An evaluation that runs your executor (prompt/agent/code) on a dataset. +- Scoring via simple functions or hosted graders. +- Results you can sort/filter, with trace links for every datapoint. -When you are building with foundation models, creating high-quality evals is one of the most impactful things you can do. Developing AI solutions involves an iterative design process. Without evals, it can be difficult and time-intensive to understand how different model versions and prompts affect your use case. +## Copy/paste/run (minimal setup) -With continuous model upgrades from providers, evals allow you to efficiently test model performance for your specific uses in a standardized way. Developing a suite of evals customized to your objectives will help you quickly understand how new models perform for your applications. You can also make evals part of your CI/CD pipeline to ensure desired accuracy before deployment. +1. Install Laminar + your model SDK. +2. Write one `evaluate` call with your executor + evaluator. +3. Run `lmnr eval ...` (CLI finds the evaluate call). +4. Click the dashboard link to inspect scores and traces. -### Types of evals +See the [Evaluations quickstart](/evaluations/quickstart) for full code in TypeScript and Python. -There are two main approaches to evaluating outputs: +## Why evaluations? -**1. Logic-based evaluation**: The simplest and most common type uses code to check outputs against expected answers. For example: -- String matching to check if the completion includes an expected phrase -- Parsing to validate proper JSON output -- Custom logic to verify domain-specific requirements +- **Guardrails for change**: catch regressions when you update models, prompts, or tools. +- **Compare variants**: run side-by-side prompts/agents and sort by scores. +- **Trace-first debugging**: every datapoint has a trace so you can see why it passed/failed. +- **CI-friendly**: wire into pipelines to block risky changes. -**2. Model-based evaluation**: A two-stage process where: -- First, the model generates a response to the input -- Then, another model (ideally more powerful) evaluates the quality of that response +## How Laminar structures evals -Model-based evaluation works best with powerful models when the desired output has significant variation, such as open-ended questions or creative tasks. +- **Dataset**: list of datapoints with `data` (input), optional `target`, and metadata for filtering. +- **Executor**: your code that produces an output (LLM call, agent step, business logic). +- **Evaluator(s)**: functions or hosted graders that score outputs; can return one or many numeric scores. +- **Groups**: label related runs to track progress over time. +- **Tracing**: automatically records executor + evaluator spans with inputs/outputs. -## How evals differ from traditional unit tests +## Build this next -Unlike traditional unit tests that focus on binary pass/fail outcomes, evaluations for AI systems require continuous tracking and visualization of performance metrics over time. As models evolve and prompts are refined, being able to compare performance across different versions becomes critical. 
- -What makes evals unique is their ability to: -- Track nuanced quality metrics beyond simple correctness -- Visualize performance trends across model versions and prompt iterations -- Compare multiple implementations side-by-side -- Detect subtle regressions that might not be obvious in isolated tests - -Laminar provides the best developer experience and visualization capabilities for AI evaluations, making it easy to understand how your models are performing and where improvements can be made. With comprehensive dashboards and detailed tracing, you can get deep insights into every aspect of your AI system's behavior. - -Check out our [Quickstart Guide](/evaluations/quickstart) to run your first evaluation. \ No newline at end of file +- Run your first eval in 5 minutes → [Quickstart](/evaluations/quickstart) +- Use hosted graders → [Online evaluators](/evaluations/online-evaluators/introduction) +- Create/share datasets → [Datasets](/datasets/introduction) +- Pipe results into dashboards → [Custom dashboards](/custom-dashboards/overview) diff --git a/evaluations/online-evaluations.mdx b/evaluations/online-evaluations.mdx index b2bfbc7..b853155 100644 --- a/evaluations/online-evaluations.mdx +++ b/evaluations/online-evaluations.mdx @@ -24,7 +24,7 @@ and run evaluations post-factum. - **Span path** – this is an identifier of your LLM function. It is constructed from the location of the call within your code, and must ideally be unique. For more -information about what a span is, see the [tracing documentation](/tracing/introduction). +information about what a span is, see the [tracing quickstart](/tracing/quickstart). - **Span label** – a label attached to a span. It can be a boolean or a categorical label. The label can be both set manually or by an evaluator. - **Evaluator** – the function evaluating the input and output of your LLM call. It diff --git a/evaluations/quickstart.mdx b/evaluations/quickstart.mdx index c6c096f..8d5cf2c 100644 --- a/evaluations/quickstart.mdx +++ b/evaluations/quickstart.mdx @@ -1,151 +1,93 @@ --- -title: Get started with Laminar evaluations +title: Run your first Laminar evaluation sidebarTitle: Quickstart --- import GetProjectApiKey from '/snippets/get-project-api-key.mdx'; -This guide will walk you through running your first evaluation using Laminar's evaluation system. +Run a complete evaluation from inputs to scores with trace links in under 5 minutes. -Laminar provides a structured approach to create, run, and track your AI system's performance through these key components: + +Embed placeholder: short GIF showing `lmnr eval my-first-evaluation.ts` and the resulting dashboard with clickable traces. + -- **Executors** - Functions that process inputs and produce outputs, such as prompt templates, LLM calls, or production logic -- **Evaluators** - Functions that assess outputs against targets or quality criteria, producing numeric scores -- **Datasets** - Collections of datapoints (test cases) with 3 key elements: - - `data` - Required JSON input sent to the executor - - `target` - Optional reference data sent to the evaluator, typically containing expected outputs - - `metadata` - Optional metadata. This can be used to filter evaluation results in the UI after the evaluation is run. 
-- **Visualization** - Tools to track performance trends and detect regressions over time -- **Tracing** - Automatic recording of execution flow and model invocations +## What you'll build -Example datapoint: -```json5 -{ - "data": { - "question": "What is the capital of France? Respond in one word.", - }, - "target": { - "answer": "Paris" - }, - "metadata": { - "category": "geography" - } -} -``` - -**Evaluation Groups** group related evaluations to assess one feature or component, with results aggregated for comparison. - -## Evaluation Lifecycle - -For each datapoint in a dataset: +- A dataset with a couple of datapoints. +- An executor that calls your model. +- An evaluator that scores the output. +- A dashboard view with scores + trace links for each datapoint. -1. The executor receives the `data` as input -2. The executor runs and its output is stored -3. Both the executor output and `target` are passed to the evaluator -4. The evaluator produces either a numeric score or a JSON object with multiple numeric scores -5. Results are stored and can be visualized to track performance over time +## Why this matters -This approach helps you continuously measure your AI system's performance as you make changes, showing the impact of model updates, prompt revisions, and code changes. +- Catch regressions when you change models, prompts, or tools. +- Compare variants side by side with scores and trace context. +- Keep evals close to production by linking every datapoint to its trace. -### Evaluation function types +## Prerequisites -Each executor takes in the `data` as it is defined in the datapoints. -Evaluator accepts the output of the executor as its first argument, -and `target` as it's defined in the datapoints as the second argument. + -This means that the type of the `data` fields in your datapoints must -match the type of the first parameter of the executor function. Similarly, -the type of the `target` fields in your datapoints must match the type of -the second parameter of the evaluator function(s). +Set `LMNR_PROJECT_API_KEY` in your environment. -Python is a bit more permissive. If you see type errors in TypeScript, -make sure the data types and the parameter types match. +## Copy/paste/run -For a more precise description, here's the partial TypeScript type signature of the `evaluate` function: + + -```javascript -evaluate( - data: { - data: D, - target?: T, - }, - executor: (data: D, ...args: any[]) => O | Promise; - evaluators: { - [key: string]: (output: O, target?: T, ...args: any[]) => - number | { [key: string]: number } - }, - // ... other parameters -) +```bash +npm install @lmnr-ai/lmnr openai ``` -See full reference [here](/evaluations/reference#typescript-evaluation-types). - -## Create your first evaluation - -### Prerequisites - - - -### Create an evaluation file - - - - -Create a file named `my-first-evaluation.ts` and add the following code: +Create `my-first-evaluation.ts`: -```javascript my-first-evaluation.ts +```ts my-first-evaluation.ts import { evaluate } from '@lmnr-ai/lmnr'; -import { OpenAI } from 'openai'; +import { OpenAI } from 'openai'; const client = new OpenAI(); -const capitalOfCountry = async (data: {country: string}) => { - // replace this with your LLM call or custom logic +const capitalOfCountry = async (data: { country: string }) => { const response = await client.chat.completions.create({ - model: 'gpt-4.1-nano', + model: 'gpt-4o-mini', messages: [ { role: 'user', - content: `What is the capital of ${data['country']}? 
` + - 'Answer only with the capital, no other text.' - } - ] + content: `What is the capital of ${data.country}? Respond in one word.`, + }, + ], }); return response.choices[0].message.content || ''; -} +}; evaluate({ data: [ - { - data: { country: 'France' }, - target: 'Paris', - }, - { - data: { country: 'Germany' }, - target: 'Berlin', - }, + { data: { country: 'France' }, target: 'Paris' }, + { data: { country: 'Germany' }, target: 'Berlin' }, ], executor: capitalOfCountry, evaluators: { - accuracy: (output: string, target: string | undefined): number => { - if (!target) return 0; - return output.includes(target) ? 1 : 0; - } + accuracy: (output, target) => (!target ? 0 : output.includes(target) ? 1 : 0), + }, + config: { + instrumentModules: { openAI: OpenAI }, }, - config: { - instrumentModules: { - openAI: OpenAI - } - } -}) +}); +``` + +Run it: + +```bash +LMNR_PROJECT_API_KEY=your_key npx lmnr eval my-first-evaluation.ts ``` - -It is important to pass the `config` object with `instrumentModules` to `evaluate` to ensure that the OpenAI client and any other instrumented modules are instrumented. - - + -Create a file named `my-first-evaluation.py` and add the following code: +```bash +pip install "lmnr[openai]" openai +``` + +Create `my-first-evaluation.py`: ```python my-first-evaluation.py from lmnr import evaluate @@ -154,176 +96,53 @@ from openai import OpenAI client = OpenAI() def capital_of_country(data: dict) -> str: - # replace this with your LLM call or custom logic - return client.chat.completions.create( - model="gpt-4.1-nano", - messages=[ - { - "role": "user", - "content": f"Generate a poem about {data['country']}. " + - "Answer only with the poem, no other text." - } - ] - ).choices[0].message.content - -def accuracy(output: str, target: str) -> int: + resp = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": f"What is the capital of {data['country']}? Answer in one word."}], + ) + return resp.choices[0].message.content or "" + +def accuracy(output: str, target: str | None) -> int: + if not target: + return 0 return 1 if target in output else 0 evaluate( data=[ - { - "data": {"country": "France"}, - "target": "Paris" - }, - { - "data": {"country": "Germany"}, - "target": "Berlin" - }, + {"data": {"country": "France"}, "target": "Paris"}, + {"data": {"country": "Germany"}, "target": "Berlin"}, ], executor=capital_of_country, - evaluators={"accuracy": accuracy} + evaluators={"accuracy": accuracy}, ) ``` - - - -### Run the evaluation -You can run evaluations in two ways: using the `lmnr eval` CLI or directly executing the evaluation file. - -#### Using the CLI - -The Laminar CLI automatically detects top-level `evaluate` function calls in your files - you don't need to wrap them in a `main` function or any special structure. - - - -```sh -export LMNR_PROJECT_API_KEY= -npx lmnr eval my-first-evaluation.ts -``` +Run it: -To run multiple evaluations, place them in an `evals` directory with the naming pattern `*.eval.{ts,js}`: - -``` -├─ src/ -├─ evals/ -│ ├── my-first-evaluation.eval.ts -│ ├── my-second-evaluation.eval.ts -│ ├── ... -``` - -Then run all evaluations with a single command: -```sh -npx lmnr eval -``` - - -```sh -# 1. Make sure `lmnr` is installed in a virtual environment -# lmnr --help -# 2. 
Run the evaluation -export LMNR_PROJECT_API_KEY= -lmnr eval my-first-evaluation.py -``` - -To run multiple evaluations, place them in an `evals` directory with the naming pattern `eval_*.py` or `*_eval.py`: - -``` -├─ src/ -├─ evals/ -│ ├── eval_first.py -│ ├── second_eval.py -│ ├── ... -``` - -Then run all evaluations with a single command: -```sh -lmnr eval -``` - - - -#### Running as a standalone script - -You can also import and call `evaluate` directly from your application code: - - - -```bash -ts-node my-first-evaluation.ts -# or -npx tsx my-first-evaluation.ts -``` - - ```bash -python my-first-evaluation.py +LMNR_PROJECT_API_KEY=your_key lmnr eval my-first-evaluation.py ``` + -The `evaluate` function is flexible and can be used both in standalone scripts processed by the CLI and integrated directly into your application code. - - -Evaluator functions must return either a single numeric score or a JSON object where each key is a score name and the value is a numeric score. - - -No need to initialize Laminar - `evaluate` automatically initializes Laminar behind the scenes. All instrumented function calls and model invocations are traced without any additional setup. +No explicit initialization needed. `evaluate` wires up tracing automatically. Passing `instrumentModules` (TS) or installing extras (Python) ensures your model SDK calls are traced alongside the evaluator. -### View evaluation results - -When you run an evaluation from the CLI, Laminar will output the link to the dashboard where you can view the evaluation results. - -Laminar stores every evaluation result. A run for every datapoint is represented as a trace. You can view the results and corresponding traces in the evaluations page. +## What you should see - - Example evaluation - +- CLI prints a link to the evaluation run. +- Dashboard shows scores per datapoint. +- Each row links to a trace; open it to inspect inputs, outputs, tokens, and cost. -## Tracking evaluation progress + +Embed placeholder: screenshot of eval results table with a trace link, plus an open trace showing executor + evaluator spans. + -To track the score progression over time or compare evaluations side-by-side, you need to group them together. This can be achieved by passing the `groupName` parameter to the `evaluate` function. - - - -```javascript {7} -import { evaluate, LaminarDataset } from '@lmnr-ai/lmnr'; - -evaluate({ - data: new LaminarDataset("name_of_your_dataset"), - executor: yourExecutorFunction, - evaluators: {evaluatorName: yourEvaluator}, - groupName: "evals_group_1", -}); -``` - - -```python {9} -from lmnr import evaluate, LaminarDataset -import os - -evaluate( - data=LaminarDataset("name_of_your_dataset"), - executor=your_executor_function, - evaluators={"evaluator_name": your_evaluator}, - project_api_key=os.environ["LMNR_PROJECT_API_KEY"], - group_name="evals_group_1", - # ... 
other optional parameters -) -``` - - +## Build this next - - Example evaluation progression - \ No newline at end of file +- Use hosted graders → [Scoring with hosted evaluators](/evaluations/online-evaluators/scoring-with-hosted-evaluators) +- Reuse datasets → [Using datasets](/evaluations/using-dataset) +- Track improvements over time → pass `groupName` and view trend charts +- Turn traces into labeled data → [Labeling queues](/queues/quickstart) diff --git a/guides/evaluating-tool-calls.mdx b/guides/evaluating-tool-calls.mdx index f602361..d269bd2 100644 --- a/guides/evaluating-tool-calls.mdx +++ b/guides/evaluating-tool-calls.mdx @@ -1,12 +1,12 @@ --- title: Evaluating LLM Tool Calls with Laminar sidebarTitle: Evaluating Tool Calls -description: A comprehensive guide to evaluating AI agent tool calls using a Data Analysis Assistant example - from production tracing to systematic evaluation +description: A comprehensive guide to evaluating agent tool calls using a Data Analysis Assistant example - from production tracing to systematic evaluation --- ## Overview -In this guide, we'll follow the complete journey of building and improving a **Data Analysis Assistant** - an AI agent that helps users analyze their data, create visualizations, and generate insights. This example showcases how Laminar's end-to-end platform helps you build reliable tool-calling agents. +In this guide, we'll follow the complete journey of building and improving a **Data Analysis Assistant** - an agent that helps users analyze their data, create visualizations, and generate insights. This example showcases how Laminar's end-to-end platform helps you build reliable tool-calling agents. ### **Why This Guide Matters** Tool-calling agents are powerful but complex - they need to select the right tools, use correct parameters, and handle multi-step workflows. Unlike simple text generation, evaluating these agents requires understanding their decision-making process and systematic improvement based on real user interactions. @@ -658,11 +658,11 @@ Use these insights to: - **Add New Tools**: Identify missing capabilities from user feedback - **Update Training Data**: Create more diverse evaluation cases -This approach ensures the Data Analysis Assistant continuously improves based on real user interactions and systematic evaluation, leading to more reliable and useful AI agents. +This approach ensures the Data Analysis Assistant continuously improves based on real user interactions and systematic evaluation, leading to more reliable and useful agents. ## Learn More To dive deeper into the concepts covered in this guide: -- **[Tracing Documentation](/tracing/introduction)**: Learn more about automatic instrumentation, manual span creation, and advanced tracing patterns -- **[Evaluations Documentation](/evaluations/introduction)**: Explore advanced evaluation patterns, custom evaluators, and evaluation best practices \ No newline at end of file +- **[Tracing quickstart](/tracing/quickstart)**: Learn more about automatic instrumentation, manual span creation, and advanced tracing patterns +- **[Evaluations Documentation](/evaluations/introduction)**: Explore advanced evaluation patterns, custom evaluators, and evaluation best practices diff --git a/guides/nextjs-aisdk.mdx b/guides/nextjs-aisdk.mdx index 6a32fa5..0f31779 100644 --- a/guides/nextjs-aisdk.mdx +++ b/guides/nextjs-aisdk.mdx @@ -45,7 +45,7 @@ cp .env.local.example .env.local ``` And then fill in the `.env.local` file. 
-Get [Laminar project API key](https://docs.lmnr.ai/tracing/introduction#2-initialize-laminar-in-your-application). +Get [Laminar project API key](https://docs.lmnr.ai/tracing/quickstart#three-commands-to-first-trace). Get [OpenAI API key](https://platform.openai.com/api-keys) diff --git a/guides/nextjs.mdx b/guides/nextjs.mdx index b107558..e99c06c 100644 --- a/guides/nextjs.mdx +++ b/guides/nextjs.mdx @@ -44,7 +44,7 @@ cp .env.local.example .env.local ``` And then fill in the `.env.local` file. -Get [Laminar project API key](https://docs.lmnr.ai/tracing/introduction#2-initialize-laminar-in-your-application). +Get [Laminar project API key](https://docs.lmnr.ai/tracing/quickstart#three-commands-to-first-trace). Get [OpenAI API key](https://platform.openai.com/api-keys). Get [Anthropic API key](https://console.anthropic.com/settings/keys). diff --git a/installation.mdx b/installation.mdx index c47d8fd..60827e4 100644 --- a/installation.mdx +++ b/installation.mdx @@ -1,112 +1,61 @@ --- title: Installation -description: Laminar installation guide +description: Copy/paste installs to get your first Laminar trace fast. --- -## Install the package +Laminar is designed to get you to a live trace in minutes. Pick your language, run the three commands, and your LLM calls will show up with inputs, outputs, tokens, and costs. - - +## What you'll do -Install the package from [npm](https://www.npmjs.com/package/@lmnr-ai/lmnr). +- Install Laminar alongside your model SDK. +- Add one import to auto-instrument. +- Run your existing code and open the trace. -```sh -npm add @lmnr-ai/lmnr -``` - - -yarn - -```sh -yarn add @lmnr-ai/lmnr -``` - -pnpm - -```sh -pnpm add @lmnr-ai/lmnr -``` - - - - -Install the package from [PyPI](https://pypi.org/project/lmnr/). - -```sh -pip install --upgrade 'lmnr[all]' -``` + +If you just want the fastest path, use the commands below. For advanced control (selective instrumentation, proxies), see [automatic instrumentation](/tracing/automatic-instrumentation). + -This will install the package and enable all the available automatic instrumentations. +## JavaScript/TypeScript -However, this installs a lot of dependencies, so you can specify the extras to enable -specific automatic instrumentations of client SDKs/libraries. +```bash +# 1) Install Laminar + OpenAI client +npm install @lmnr-ai/lmnr openai -For example, to enable automatic instrumentations of Anthropic and OpenAI, run: +# 2) One-line auto-instrumentation (import before app code) +echo "import 'lmnr/auto'" > bootstrap.ts -```sh -pip install --upgrade 'lmnr[anthropic,openai]' +# 3) Run with your Laminar key +LMNR_PROJECT_API_KEY=your_key node app.js ``` - -If you do not specify any extras, no automatic instrumentation will be enabled. - - - + +Yarn: `yarn add @lmnr-ai/lmnr openai` +pnpm: `pnpm add @lmnr-ai/lmnr openai` + -poetry +## Python -```sh -poetry add 'lmnr[anthropic,openai]' -``` +```bash +# 1) Install Laminar + OpenAI client +pip install --upgrade "lmnr[openai]" -uv +# 2) One-line auto-instrumentation +echo "import lmnr.auto" > bootstrap.py -```sh -uv add lmnr --extra anthropic --extra openai +# 3) Run with your Laminar key +LMNR_PROJECT_API_KEY=your_key python app.py ``` -uv pip + +Add extras to auto-trace specific SDKs. 
Examples: -```sh -uv pip install 'lmnr[anthropic,openai]' -``` +- Anthropic + OpenAI: `pip install --upgrade "lmnr[anthropic,openai]"` +- LangChain + LlamaIndex: `pip install --upgrade "lmnr[langchain,llamaindex]"` +- Vector DBs (Pinecone, Weaviate, Qdrant): `pip install --upgrade "lmnr[pinecone,weaviate,qdrant]"` +Full extras list: alephalpha, anthropic, bedrock, cohere, google-generativeai, groq, haystack, lancedb, langchain, llamaindex, marqo, milvus, mistralai, ollama, openai, pinecone, qdrant, replicate, sagemaker, together, transformers, vertexai, watsonx, weaviate. - -Full list of available extras: - -- `alephalpha` -- `anthropic` -- `bedrock` -- `cohere` -- `google-generativeai` -- `groq` -- `haystack` -- `lancedb` -- `langchain` -- `llamaindex` -- `marqo` -- `milvus` -- `mistralai` -- `ollama` -- `openai` -- `pinecone` -- `qdrant` -- `replicate` -- `sagemaker` -- `together` -- `transformers` -- `vertexai` -- `watsonx` -- `weaviate` - - - - The extras configuration is only available since version `0.4.39`. - Before that, default option would install all the available instruments. - - - - - \ No newline at end of file + +Embed placeholder: short GIF of running the three commands and opening the resulting trace with cost + tokens visible. + diff --git a/overview.mdx b/overview.mdx index 36df142..1ee84ea 100644 --- a/overview.mdx +++ b/overview.mdx @@ -3,40 +3,72 @@ title: Laminar sidebarTitle: Laminar --- -Laminar is a comprehensive **open-source platform** for observability and evaluations of AI agents. +
+## See exactly what your agents are doing
+
+Trace every LLM call, debug browser agents, and run evaluations in minutes. Open-source, ready to self-host, with a managed cloud at [laminar.sh](https://laminar.sh).
+
-- **Open-source** - Fully open-source and easy to self-host. Give us a ⭐ [here](https://github.com/lmnr-ai/lmnr) -- **Cloud** - Managed cloud service available at [laminar.sh](https://laminar.sh) +Laminar trace view showing LLM calls and browser session recording -## Get Started +## What you can do - - - Instrument your entire AI application and automatically trace popular AI libraries and SDKs — **OpenAI, Anthropic, Gemini, Vercel AI SDK, LangChain, Browser Use**, and more. - + + + Automatically instrument OpenAI, Anthropic, Gemini, LangChain, Browser Use, Stagehand, and more. See inputs, outputs, tokens, and cost. + + + Watch session recordings synchronized with traces so you can see exactly what the agent saw. + + + Score prompts and agents, compare variants, and track progress with traces for every datapoint. + + + Query traces, evals, and costs to build dashboards or export datasets for labeling. + + - - Measure, track, and improve your AI application performance with powerful evaluation tools. - +## Get started in 60 seconds - - Query all your data stored in Laminar using SQL for advanced analytics, custom dashboards, and dataset creation. - + + + ```bash + npm install @lmnr-ai/lmnr openai + ``` + + + ```typescript + import 'lmnr/auto'; + ``` + + + ```bash + LMNR_PROJECT_API_KEY=your_key node app.js + ``` + Open your Laminar dashboard and watch traces appear. + + - - Create and manage custom dashboards from trace and evaluation data. - - - - Experiment with prompts, test different models, and iterate on your AI application in an interactive environment. - - - - Use streamlined UI to quickly label and build datasets for evaluations from trace data and other datasets. - - - - Create and manage datasets for evaluations and other use cases. - - - +
+## Ready to see your first trace?
+
+Instrument your app and view inputs, outputs, tokens, and cost in minutes.
+
+[Go to quickstart →](/tracing/quickstart)
diff --git a/playground/introduction.mdx b/playground/introduction.mdx index b8b6e5e..9dfd34c 100644 --- a/playground/introduction.mdx +++ b/playground/introduction.mdx @@ -1,24 +1,37 @@ --- sidebarTitle: Introduction -title: Introduction to Laminar Playground +title: Iterate faster with Laminar Playground --- -Playground is an interactive environment that allows you to experiment with AI models, test prompts, and analyze responses. +Playground lets you tweak prompts, tools, and models using real trace context without extra wiring. Open any span, reproduce it, and iterate until it is right. -## What is the Playground? + +Embed placeholder: split view GIF of opening a span in Playground, editing the prompt, and seeing a new output + tokens/cost. + -The Playground serves as a sandbox environment where you can: +## What you'll do here -- **Prompt experimentation**: Test different prompt variations and input configurations with instant results. -- [**Playground from span**](/playground/playground-from-span): Reproduce and experiment with exact configurations from any span by opening it directly in playground. -- [**Tool integration**](/playground/tools): Configure and test custom tools that models can call during conversations. -- [**Session history**](/playground/history): Access complete traces of all previous runs with full context and configurations. +- Reproduce any trace span (LLM/tool call) in one click. +- Edit prompts, models, and tool configs with instant feedback. +- Save variants and compare outputs; keep session history for later. -
- Laminar Playground Interface -
+## Fastest way to try it -To access the playground: -1. Navigate to your Laminar dashboard -2. Create playground by clicking on "New playground" -3. Choose your preferred model and start experimenting \ No newline at end of file +1. Run the [Tracing quickstart](/tracing/quickstart) to generate a trace. +2. In the Laminar UI, open that trace and click **“Open in Playground.”** +3. Edit the prompt/model/tool settings; run and compare outputs. +4. Save the best variant or push it back to your app. + +## Key features + +- **Playground from span**: jump in directly from any LLM/tool span with the original inputs and settings. +- **Tools**: configure custom tools and test tool-calling models end-to-end. +- **History**: every run is saved with inputs/outputs, tokens, and cost. +- **Shareable**: copy a link to share the exact playground state with teammates. + +## Build this next + +- Open a trace and reproduce a span → [Playground from span](/playground/playground-from-span) +- Add and test tools → [Tools](/playground/tools) +- Review previous runs → [History](/playground/history) +- Capture the trace that feeds your playground → [Tracing quickstart](/tracing/quickstart) diff --git a/projects/introduction.mdx b/projects/introduction.mdx index c8d086c..f6b137d 100644 --- a/projects/introduction.mdx +++ b/projects/introduction.mdx @@ -5,7 +5,7 @@ title: Introduction ## What is it A Laminar project is a collection of [datasets](/datasets/introduction), -[evaluations](/evaluations/introduction), and [traces](/tracing/introduction). +[evaluations](/evaluations/introduction), and [traces](/tracing/quickstart). Access [API keys](/api-reference/introduction#authentication) and environment variables are configured at the project level as well. Think about it as a scope of a certain work. A project always belong to a workspace. ## Creating a project diff --git a/queues/quickstart.mdx b/queues/quickstart.mdx index 785afc5..2175ea1 100644 --- a/queues/quickstart.mdx +++ b/queues/quickstart.mdx @@ -1,76 +1,39 @@ --- title: Labeling Queues sidebarTitle: Quickstart -description: Labeling queues are a way to quickly label and build datasets for evaluations from span data and other datasets. +description: Turn traces and datasets into labeled examples fast. --- -## What is a Labeling Queue? +Labeling queues let you turn raw outputs into labeled targets with a fast, focused UI. Push spans or dataset rows into a queue, label them, and write back to datasets for evals or training. -- A labeling queue is a collection of items that need to be labeled. -- Labeling queue is an actual queue with FIFO (first in, first out) order. -- Items in the queue have exactly the same shape as datapoints in a dataset. -- Labeling operation in this context means writing a data to the target field of a datapoint. + +Embed placeholder: GIF of pushing a span to a queue, editing the target JSON, and saving. + -
-Screenshot of a trace visualization -
+## What you'll do -## How to Use the Labeling Interface +- Queue items that share the dataset shape (`data`, `target`, `metadata`). +- Edit targets quickly with a side-by-side payload + editor. +- Save to a dataset in a single click; keep FIFO flow to stay organized. -When you open a labeling queue, you'll see a split-screen interface designed for efficient labeling: +## Fast path -### Payload view -The left panel shows you the full JSON payload of the current item you're labeling. -Payload is a JSON object with the same shape as a datapoint in a dataset. It has `data` and `target` fields. +1. From any trace span, click **“Send to labeling queue.”** Inputs become `data`; outputs become `target`. +2. Open the queue; left pane shows payload, right pane is the target editor. +3. Choose the destination dataset, click **Complete**, move to the next item. +4. Use **Skip/Prev/Next** to navigate; item counter shows progress. +## Other ways to populate queues -### Target Editor -This is where you do the actual labeling work: -- **Edit the JSON in the target editor** to correct, improve, or write new data to the target field. -- **Use proper JSON formatting** - the editor will help you with syntax highlighting +- From a dataset: select rows and push them into a queue for review. +- From SQL: export query results into a dataset, then queue them for labeling. -As you type in the target editor on the right, watch how the `"target"` section in the left payload updates in real-time. This helps you see exactly what will be saved to your dataset. +Queues preserve the dataset JSON shape, so labeled items drop straight back into evals without extra mapping. +## Build this next -### Save Preferences -- **Select your target dataset** from the dropdown to choose where completed items should go -- **Click "Complete"** to save the current item to the dataset and move to the next item in the queue. - -### Navigation -- **Check the item counter** ("Item 5 of 11") to see how many items you've completed and how many remain -- **Use the navigation buttons** to move through your queue: - - Click **"Skip"** if you want to pass on the current item without making changes - - Use **"Prev"** and **"Next"** to move between items (helpful for comparing similar cases) - - Click **"Complete"** when you're satisfied with your labeling - -## Push items to the queue - -### From Span View -You can push individual spans directly to a labeling queue for labeling. -This is particularly useful when you want to label specific model outputs for evaluation. -Span input will be added to the `data` field of the datapoint, and span output will be added to the `target` field. - -
-Screenshot of a trace visualization -
- -### From Dataset View -You can also push existing datapoints from datasets into a labeling queue. -You can either push individual datapoint or select a subset of datapoints in the dataset view. - -
-Screenshot of a trace visualization -
- - - - - - -When pushing items to a queue, they maintain the same JSON structure as datapoints in datasets, ensuring consistency between your labeling workflow and final datasets. - - - - +- Run evals on your newly labeled data → [Evaluations quickstart](/evaluations/quickstart) +- Export tricky cases via SQL → [SQL editor](/sql-editor/introduction) +- Capture the traces that feed your queues → [Tracing quickstart](/tracing/quickstart) diff --git a/snippets/Trace.mdx b/snippets/Trace.mdx new file mode 100644 index 0000000..861c86d --- /dev/null +++ b/snippets/Trace.mdx @@ -0,0 +1,58 @@ +import React, { useMemo } from "react"; + +export default function Trace({ id, traceId, spanId, host, height = 760, previewOnly = false }) { + const baseStyles = { + container: { + border: "1px solid rgba(0,0,0,0.08)", + background: "linear-gradient(145deg, rgba(0,0,0,0.02), rgba(0,0,0,0.01))", + borderRadius: 12, + color: "inherit", + fontFamily: "Inter, system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif", + boxShadow: "0 10px 40px rgba(0,0,0,0.1)", + overflow: "hidden", + }, + notice: { + padding: 14, + fontSize: 14, + color: "#4b5563", + }, + frame: { + border: "none", + width: "100%", + display: "block", + }, + }; + + const traceIdentifier = traceId || id; + const resolvedHost = useMemo(() => { + const fallback = "https://laminar.sh"; + const detected = typeof window !== "undefined" ? window.location.origin : ""; + const base = host || detected || fallback; + return base.endsWith("/") ? base.slice(0, -1) : base; + }, [host]); + + const src = traceIdentifier + ? `${resolvedHost}/shared/traces/${traceIdentifier}${spanId ? `?spanId=${spanId}` : ""}` + : ""; + + if (!traceIdentifier || previewOnly) { + return ( +
+      <div style={baseStyles.container}>
+        <div style={baseStyles.notice}>
+          Preview your trace embed here. Provide a `traceId` (and optional `spanId`) to render the shared trace viewer.
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <div style={baseStyles.container}>