Langfuse
These examples require langfuse v3.0.0 or later.
Setup
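Both SDKs are distributed on PyPI; a minimal install, assuming the package names langfuse and root-signals:
pip install "langfuse>=3" root-signals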
from langfuse import observe, get_client
from root import RootSignals
# Initialize Langfuse client using environment variables
# LANGFUSE_SECRET_KEY, LANGFUSE_PUBLIC_KEY, LANGFUSE_HOST
langfuse = get_client()
# Initialize RootSignals client
# Authenticates using the Root Signals API key from the environment
root_signals = RootSignals()
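The real-time example below also uses an LLM client (client) and a prompt_template that this snippet does not define; a minimal sketch, assuming the OpenAI Python SDK and a simple template:
from openai import OpenAI

client = OpenAI()  # Assumed LLM client; reads OPENAI_API_KEY from the environment
prompt_template = "Explain the following concept: {question}"  # Assumed template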
Real-Time Evaluation
Evaluate LLM responses as they are generated and automatically log scores to Langfuse.
Instrumented LLM Function
@observe(name="explain_concept_generation")  # Name for traces in the Langfuse UI
def explain_concept(topic: str) -> tuple[str | None, str | None]:
    # Get the trace_id for the current operation, created by @observe
    current_trace_id = langfuse.get_current_trace_id()
    prompt = prompt_template.format(question=topic)
    response_obj = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4",
    )
    content = response_obj.choices[0].message.content
    return content, current_trace_id
Evaluation Function
def evaluate_concept(request: str, response: str, trace_id: str) -> None:
    # Invoke a specific Root Signals judge
    result = root_signals.judges.run(
        judge_id="4d369224-dcfa-45e9-939d-075fa1dad99e",
        request=request,    # The input/prompt provided to the LLM
        response=response,  # The LLM's output to be evaluated
    )
    # Iterate through evaluation results and log them as Langfuse scores
    for eval_result in result.evaluator_results:
        langfuse.create_score(
            trace_id=trace_id,                # Links the score to the specific Langfuse trace
            name=eval_result.evaluator_name,  # Name of the Root Signals evaluator (e.g., "Truthfulness")
            value=eval_result.score,          # Numerical score from the evaluator
            comment=eval_result.justification,  # Explanation for the score
        )
Usage
# Generate and evaluate
response, trace_id = explain_concept("What is photosynthesis?")
evaluate_concept("What is photosynthesis?", response, trace_id)
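The judge call adds a network round trip, so in a latency-sensitive service you may prefer to score off the request path. A minimal sketch (not part of the original example) using a background thread and a hypothetical evaluate_in_background helper:
import threading

# Hypothetical helper: score the trace without blocking the caller
def evaluate_in_background(request: str, response: str, trace_id: str) -> None:
    threading.Thread(
        target=evaluate_concept,
        args=(request, response, trace_id),
        daemon=True,
    ).start()

response, trace_id = explain_concept("What is photosynthesis?")
evaluate_in_background("What is photosynthesis?", response, trace_id)
# Call langfuse.flush() before the process exits so queued scores are sent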
Mapping Root Signals to Langfuse
| Root Signals | Langfuse | Description in Langfuse Context |
| --- | --- | --- |
| evaluator_name | name | The name of the evaluation criterion (e.g., "Hallucination", "Conciseness"). Used for identifying and filtering scores. |
| score | value | The numerical score assigned by the Root Signals evaluator. |
| justification | comment | The textual explanation from Root Signals for the score, providing qualitative insight into the evaluation. |
Batch Evaluation
Evaluate traces that have already been observed and stored in Langfuse. This is useful for running evaluations on historical data and for batch-processing evaluations on production traces.
Evaluating Historical Traces
from datetime import datetime, timedelta
from langfuse import get_client
from root import RootSignals

# Initialize clients
langfuse = get_client()  # uses environment variables to authenticate
root_signals = RootSignals()

if langfuse.auth_check():
    print("Langfuse client is authenticated and ready!")

# Fetch the latest 10 traces from the last 24 hours
traces = langfuse.api.trace.list(
    limit=10,
    # tags=["my-tag"],  # You can filter traces by tags
    from_timestamp=datetime.now() - timedelta(days=1),
).data

for trace in traces:
    trace_id = trace.id

    # Get all LLM generations for this trace
    observations = langfuse.api.observations.get_many(
        trace_id=trace_id,
        type="GENERATION",
        limit=100,
    ).data

    for observation in observations:
        # Extract the LLM input and output
        # (this indexing depends on how the generation was logged;
        #  adjust it to the shape of your own observations)
        llm_input = observation.input[0]["parts"][0]["content"]
        llm_output = observation.output[0]["parts"][0]["content"]

        # Run evaluation using a Root Signals judge, looked up by name
        evaluation_result = root_signals.judges.run_by_name(
            "My awesome judge I created with scorable.ai",
            response=llm_output,
            request=llm_input,
        )

        # Log the evaluation results back to Langfuse
        for evaluator_result in evaluation_result.evaluator_results:
            langfuse.create_score(
                trace_id=trace_id,
                name=evaluator_result.evaluator_name,
                value=evaluator_result.score,
                comment=evaluator_result.justification,
            )

print("Evaluation complete!")
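Score events are queued and sent in the background, so short-lived scripts should flush before exiting:
# Make sure all queued scores have been delivered before the script exits
langfuse.flush()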
