Last modified by Robert Schaub on 2025/12/24 18:26

From version 4.1
edited by Robert Schaub
on 2025/12/24 16:55
Change comment: Imported from XAR
To version 5.1
edited by Robert Schaub
on 2025/12/24 17:59
Change comment: Imported from XAR

Summary

Details

Page properties
Title
... ... @@ -1,1 +1,1 @@
1 -POC1 API & Schemas Specification v0.4.1
1 +POC1 API & Schemas Specification
Content
... ... @@ -43,15 +43,15 @@
43 43  
44 44  {{mermaid}}
45 45  graph TD
46 - A[Article Input] --> B[Stage 1: Extract Claims]
47 - B --> C{For Each Claim}
48 - C --> D[Check Cache]
49 - D -->|Cache HIT| E[Return Cached Verdict]
50 - D -->|Cache MISS| F[Stage 2: Analyze Claim]
51 - F --> G[Store in Cache]
52 - G --> E
53 - E --> H[Stage 3: Holistic Assessment]
54 - H --> I[Final Report]
46 + A[Article Input] --> B[Stage 1: Extract Claims]
47 + B --> C{For Each Claim}
48 + C --> D[Check Cache]
49 + D -->|Cache HIT| E[Return Cached Verdict]
50 + D -->|Cache MISS| F[Stage 2: Analyze Claim]
51 + F --> G[Store in Cache]
52 + G --> E
53 + E --> H[Stage 3: Holistic Assessment]
54 + H --> I[Final Report]
55 55  {{/mermaid}}
56 56  
57 57  ==== Stage 1: Claim Extraction (Haiku, no cache) ====
... ... @@ -58,7 +58,7 @@
58 58  
59 59  * **Input:** Article text
60 60  * **Output:** 5 canonical claims (normalized, deduplicated)
61 -* **Model:** Claude Haiku 4
61 +* **Model:** Claude Haiku 4 (default, configurable via LLM abstraction layer)
62 62  * **Cost:** $0.003 per article
63 63  * **Cache strategy:** No caching (article-specific)
64 64  
... ... @@ -66,7 +66,7 @@
66 66  
67 67  * **Input:** Single canonical claim
68 68  * **Output:** Scenarios + Evidence + Verdicts
69 -* **Model:** Claude Sonnet 3.5
69 +* **Model:** Claude Sonnet 3.5 (default, configurable via LLM abstraction layer)
70 70  * **Cost:** $0.081 per NEW claim
71 71  * **Cache strategy:** Redis, 90-day TTL
72 72  * **Cache key:** claim:v1norm1:{language}:{sha256(canonical_claim)}
... ... @@ -75,10 +75,14 @@
75 75  
76 76  * **Input:** Article + Claim verdicts (from cache or Stage 2)
77 77  * **Output:** Article verdict + Fallacies + Logic quality
78 -* **Model:** Claude Sonnet 3.5
78 +* **Model:** Claude Sonnet 3.5 (default, configurable via LLM abstraction layer)
79 79  * **Cost:** $0.030 per article
80 80  * **Cache strategy:** No caching (article-specific)
81 81  
82 +
83 +
84 +**Note:** Stage 3 implements **Approach 1 (Single-Pass Holistic Analysis)** from the [[Article Verdict Problem>>Test.FactHarbor.Specification.POC.Article-Verdict-Problem]]. While claim analysis (Stage 2) is cached for efficiency, the holistic assessment maintains the integrated evaluation philosophy of Approach 1.
85 +
82 82  === Total Cost Formula: ===
83 83  
84 84  {{{Cost = $0.003 (extraction) + (N_new_claims × $0.081) + $0.030 (holistic)
... ... @@ -146,27 +146,27 @@
146 146  ==== User Experience Example: ====
147 147  
148 148  {{{{
149 - "status": "cache_only_mode",
150 - "message": "Monthly credit limit reached. Showing cached results only.",
151 - "cache_coverage": {
152 - "claims_total": 5,
153 - "claims_cached": 3,
154 - "claims_missing": 2,
155 - "coverage_percent": 60
156 - },
157 - "cached_claims": [
158 - {"claim_id": "C1", "verdict": "Likely", "confidence": 0.82},
159 - {"claim_id": "C2", "verdict": "Highly Likely", "confidence": 0.91},
160 - {"claim_id": "C4", "verdict": "Unclear", "confidence": 0.55}
161 - ],
162 - "missing_claims": [
163 - {"claim_id": "C3", "claim_text": "...", "estimated_cost": "$0.081"},
164 - {"claim_id": "C5", "claim_text": "...", "estimated_cost": "$0.081"}
165 - ],
166 - "upgrade_options": {
167 - "top_up": "$5 for 20-70 more articles",
168 - "pro_tier": "$50/month unlimited"
169 - }
153 + "status": "cache_only_mode",
154 + "message": "Monthly credit limit reached. Showing cached results only.",
155 + "cache_coverage": {
156 + "claims_total": 5,
157 + "claims_cached": 3,
158 + "claims_missing": 2,
159 + "coverage_percent": 60
160 + },
161 + "cached_claims": [
162 + {"claim_id": "C1", "verdict": "Likely", "confidence": 0.82},
163 + {"claim_id": "C2", "verdict": "Highly Likely", "confidence": 0.91},
164 + {"claim_id": "C4", "verdict": "Unclear", "confidence": 0.55}
165 + ],
166 + "missing_claims": [
167 + {"claim_id": "C3", "claim_text": "...", "estimated_cost": "$0.081"},
168 + {"claim_id": "C5", "claim_text": "...", "estimated_cost": "$0.081"}
169 + ],
170 + "upgrade_options": {
171 + "top_up": "$5 for 20-70 more articles",
172 + "pro_tier": "$50/month unlimited"
173 + }
170 170  }
171 171  }}}
172 172  
... ... @@ -179,6 +179,328 @@
179 179  
180 180  ----
181 181  
186 +
187 +
188 +== 6. LLM Abstraction Layer ==
189 +
190 +=== 6.1 Design Principle ===
191 +
192 +**FactHarbor uses provider-agnostic LLM abstraction** to avoid vendor lock-in and enable:
193 +
194 +* **Provider switching:** Change LLM providers without code changes
195 +* **Cost optimization:** Use different providers for different stages
196 +* **Resilience:** Automatic fallback if primary provider fails
197 +* **Cross-checking:** Compare outputs from multiple providers
198 +* **A/B testing:** Test new models without deployment changes
199 +
200 +**Implementation:** All LLM calls go through an abstraction layer that routes to configured providers.
201 +
202 +----
203 +
204 +=== 6.2 LLM Provider Interface ===
205 +
206 +**Abstract Interface:**
207 +
208 +{{{
209 +interface LLMProvider {
210 + // Core methods
211 + complete(prompt: string, options: CompletionOptions): Promise<CompletionResponse>
212 + stream(prompt: string, options: CompletionOptions): AsyncIterator<StreamChunk>
213 +
214 + // Provider metadata
215 + getName(): string
216 + getMaxTokens(): number
217 + getCostPer1kTokens(): { input: number, output: number }
218 +
219 + // Health check
220 + isAvailable(): Promise<boolean>
221 +}
222 +
223 +interface CompletionOptions {
224 + model?: string
225 + maxTokens?: number
226 + temperature?: number
227 + stopSequences?: string[]
228 + systemPrompt?: string
229 +}
230 +}}}
231 +
232 +----
233 +
234 +=== 6.3 Supported Providers (POC1) ===
235 +
236 +**Primary Provider (Default):**
237 +
238 +* **Anthropic Claude API**
239 + * Models: Claude Haiku 4, Claude Sonnet 3.5, Claude Opus 4
240 + * Used by default in POC1
241 + * Best quality for holistic analysis
242 +
243 +**Secondary Providers (Future):**
244 +
245 +* **OpenAI API**
246 + * Models: GPT-4o, GPT-4o-mini
247 + * For cost comparison
248 +
249 +* **Google Vertex AI**
250 + * Models: Gemini 1.5 Pro, Gemini 1.5 Flash
251 + * For diversity in evidence gathering
252 +
253 +* **Local Models** (Post-POC)
254 + * Models: Llama 3.1, Mistral
255 + * For privacy-sensitive deployments
256 +
257 +----
258 +
259 +=== 6.4 Provider Configuration ===
260 +
261 +**Environment Variables:**
262 +
263 +{{{
264 +# Primary provider
265 +LLM_PRIMARY_PROVIDER=anthropic
266 +ANTHROPIC_API_KEY=sk-ant-...
267 +
268 +# Fallback provider
269 +LLM_FALLBACK_PROVIDER=openai
270 +OPENAI_API_KEY=sk-...
271 +
272 +# Provider selection per stage
273 +LLM_STAGE1_PROVIDER=anthropic
274 +LLM_STAGE1_MODEL=claude-haiku-4
275 +LLM_STAGE2_PROVIDER=anthropic
276 +LLM_STAGE2_MODEL=claude-sonnet-3-5
277 +LLM_STAGE3_PROVIDER=anthropic
278 +LLM_STAGE3_MODEL=claude-sonnet-3-5
279 +
280 +# Cost limits
281 +LLM_MAX_COST_PER_REQUEST=1.00
282 +}}}
283 +
284 +**Database Configuration (Alternative):**
285 +
286 +{{{{
287 +{
288 + "providers": [
289 + {
290 + "name": "anthropic",
291 + "api_key_ref": "vault://anthropic-api-key",
292 + "enabled": true,
293 + "priority": 1
294 + },
295 + {
296 + "name": "openai",
297 + "api_key_ref": "vault://openai-api-key",
298 + "enabled": true,
299 + "priority": 2
300 + }
301 + ],
302 + "stage_config": {
303 + "stage1": {
304 + "provider": "anthropic",
305 + "model": "claude-haiku-4",
306 + "max_tokens": 4096,
307 + "temperature": 0.0
308 + },
309 + "stage2": {
310 + "provider": "anthropic",
311 + "model": "claude-sonnet-3-5",
312 + "max_tokens": 16384,
313 + "temperature": 0.3
314 + },
315 + "stage3": {
316 + "provider": "anthropic",
317 + "model": "claude-sonnet-3-5",
318 + "max_tokens": 8192,
319 + "temperature": 0.2
320 + }
321 + }
322 +}
323 +}}}
324 +
325 +----
326 +
327 +=== 6.5 Stage-Specific Models (POC1 Defaults) ===
328 +
329 +**Stage 1: Claim Extraction**
330 +
331 +* **Default:** Anthropic Claude Haiku 4
332 +* **Alternative:** OpenAI GPT-4o-mini, Google Gemini 1.5 Flash
333 +* **Rationale:** Fast, cheap, simple task
334 +* **Cost:** ~$0.003 per article
335 +
336 +**Stage 2: Claim Analysis** (CACHEABLE)
337 +
338 +* **Default:** Anthropic Claude Sonnet 3.5
339 +* **Alternative:** OpenAI GPT-4o, Google Gemini 1.5 Pro
340 +* **Rationale:** High-quality analysis, cached 90 days
341 +* **Cost:** ~$0.081 per NEW claim
342 +
343 +**Stage 3: Holistic Assessment**
344 +
345 +* **Default:** Anthropic Claude Sonnet 3.5
346 +* **Alternative:** OpenAI GPT-4o, Claude Opus 4 (for high-stakes)
347 +* **Rationale:** Complex reasoning, logical fallacy detection
348 +* **Cost:** ~$0.030 per article
349 +
350 +**Cost Comparison (Example):**
351 +
352 +|=Stage|=Anthropic (Default)|=OpenAI Alternative|=Google Alternative
353 +|Stage 1|Claude Haiku 4 ($0.003)|GPT-4o-mini ($0.002)|Gemini Flash ($0.002)
354 +|Stage 2|Claude Sonnet 3.5 ($0.081)|GPT-4o ($0.045)|Gemini Pro ($0.050)
355 +|Stage 3|Claude Sonnet 3.5 ($0.030)|GPT-4o ($0.018)|Gemini Pro ($0.020)
356 +|**Total (0% cache)**|**$0.114**|**$0.065**|**$0.072**
357 +
358 +**Note:** POC1 uses Anthropic exclusively for consistency. Multi-provider support planned for POC2.
359 +
360 +----
361 +
362 +=== 6.6 Failover Strategy ===
363 +
364 +**Automatic Failover:**
365 +
366 +{{{
367 +async function completeLLM(stage: string, prompt: string): Promise<string> {
368 + const primaryProvider = getProviderForStage(stage)
369 + const fallbackProvider = getFallbackProvider()
370 +
371 + try {
372 + return await primaryProvider.complete(prompt)
373 + } catch (error) {
374 + if (error.type === 'rate_limit' || error.type === 'service_unavailable') {
375 + logger.warn(`Primary provider failed, using fallback`)
376 + return await fallbackProvider.complete(prompt)
377 + }
378 + throw error
379 + }
380 +}
381 +}}}
382 +
383 +**Fallback Priority:**
384 +
385 +1. **Primary:** Configured provider for stage
386 +2. **Secondary:** Fallback provider (if configured)
387 +3. **Cache:** Return cached result (if available for Stage 2)
388 +4. **Error:** Return 503 Service Unavailable
389 +
390 +----
391 +
392 +=== 6.7 Provider Selection API ===
393 +
394 +**Admin Endpoint:** POST /admin/v1/llm/configure
395 +
396 +**Update provider for specific stage:**
397 +
398 +{{{{
399 +{
400 + "stage": "stage2",
401 + "provider": "openai",
402 + "model": "gpt-4o",
403 + "max_tokens": 16384,
404 + "temperature": 0.3
405 +}
406 +}}}
407 +
408 +**Response:** 200 OK
409 +
410 +{{{{
411 +{
412 + "message": "LLM configuration updated",
413 + "stage": "stage2",
414 + "previous": {
415 + "provider": "anthropic",
416 + "model": "claude-sonnet-3-5"
417 + },
418 + "current": {
419 + "provider": "openai",
420 + "model": "gpt-4o"
421 + },
422 + "cost_impact": {
423 + "previous_cost_per_claim": 0.081,
424 + "new_cost_per_claim": 0.045,
425 + "savings_percent": 44
426 + }
427 +}
428 +}}}
429 +
430 +**Get current configuration:**
431 +
432 +GET /admin/v1/llm/config
433 +
434 +{{{{
435 +{
436 + "providers": ["anthropic", "openai"],
437 + "primary": "anthropic",
438 + "fallback": "openai",
439 + "stages": {
440 + "stage1": {
441 + "provider": "anthropic",
442 + "model": "claude-haiku-4",
443 + "cost_per_request": 0.003
444 + },
445 + "stage2": {
446 + "provider": "anthropic",
447 + "model": "claude-sonnet-3-5",
448 + "cost_per_new_claim": 0.081
449 + },
450 + "stage3": {
451 + "provider": "anthropic",
452 + "model": "claude-sonnet-3-5",
453 + "cost_per_request": 0.030
454 + }
455 + }
456 +}
457 +}}}
458 +
459 +----
460 +
461 +=== 6.8 Implementation Notes ===
462 +
463 +**Provider Adapter Pattern:**
464 +
465 +{{{
466 +class AnthropicProvider implements LLMProvider {
467 + async complete(prompt: string, options: CompletionOptions) {
468 + const response = await anthropic.messages.create({
469 + model: options.model || 'claude-sonnet-3-5',
470 + max_tokens: options.maxTokens || 4096,
471 + messages: [{ role: 'user', content: prompt }],
472 + system: options.systemPrompt
473 + })
474 + return response.content[0].text
475 + }
476 +}
477 +
478 +class OpenAIProvider implements LLMProvider {
479 + async complete(prompt: string, options: CompletionOptions) {
480 + const response = await openai.chat.completions.create({
481 + model: options.model || 'gpt-4o',
482 + max_tokens: options.maxTokens || 4096,
483 + messages: [
484 + { role: 'system', content: options.systemPrompt },
485 + { role: 'user', content: prompt }
486 + ]
487 + })
488 + return response.choices[0].message.content
489 + }
490 +}
491 +}}}
492 +
493 +**Provider Registry:**
494 +
495 +{{{
496 +const providers = new Map<string, LLMProvider>()
497 +providers.set('anthropic', new AnthropicProvider())
498 +providers.set('openai', new OpenAIProvider())
499 +providers.set('google', new GoogleProvider())
500 +
501 +function getProvider(name: string): LLMProvider {
 502 + return providers.get(name) ?? providers.get(config.primaryProvider)!
503 +}
504 +}}}
505 +
506 +----
507 +
182 182  == 3. REST API Contract ==
183 183  
184 184  === 3.1 User Credit Tracking ===
... ... @@ -188,19 +188,19 @@
188 188  **Response:** 200 OK
189 189  
190 190  {{{{
191 - "user_id": "user_abc123",
192 - "tier": "free",
193 - "credit_limit": 10.00,
194 - "credit_used": 7.42,
195 - "credit_remaining": 2.58,
196 - "reset_date": "2025-02-01T00:00:00Z",
197 - "cache_only_mode": false,
198 - "usage_stats": {
199 - "articles_analyzed": 67,
200 - "claims_from_cache": 189,
201 - "claims_newly_analyzed": 113,
202 - "cache_hit_rate": 0.626
203 - }
517 + "user_id": "user_abc123",
518 + "tier": "free",
519 + "credit_limit": 10.00,
520 + "credit_used": 7.42,
521 + "credit_remaining": 2.58,
522 + "reset_date": "2025-02-01T00:00:00Z",
523 + "cache_only_mode": false,
524 + "usage_stats": {
525 + "articles_analyzed": 67,
526 + "claims_from_cache": 189,
527 + "claims_newly_analyzed": 113,
528 + "cache_hit_rate": 0.626
529 + }
204 204  }
205 205  }}}
206 206  
... ... @@ -221,11 +221,11 @@
221 221  OR use the client.request_id field:
222 222  
223 223  {{{{
224 - "input_url": "...",
225 - "client": {
226 - "request_id": "client-uuid-12345",
227 - "source_label": "optional"
228 - }
550 + "input_url": "...",
551 + "client": {
552 + "request_id": "client-uuid-12345",
553 + "source_label": "optional"
554 + }
229 229  }
230 230  }}}
231 231  
... ... @@ -239,11 +239,11 @@
239 239  **Example Response (Idempotent):**
240 240  
241 241  {{{{
242 - "job_id": "01J...ULID",
243 - "status": "RUNNING",
244 - "idempotent": true,
245 - "original_request_at": "2025-12-24T10:31:00Z",
246 - "message": "Returning existing job (idempotency key matched)"
568 + "job_id": "01J...ULID",
569 + "status": "RUNNING",
570 + "idempotent": true,
571 + "original_request_at": "2025-12-24T10:31:00Z",
572 + "message": "Returning existing job (idempotency key matched)"
247 247  }
248 248  }}}
249 249  
... ... @@ -250,21 +250,21 @@
250 250  ==== Request Body: ====
251 251  
252 252  {{{{
253 - "input_type": "url",
254 - "input_url": "https://example.com/medical-report-01",
255 - "input_text": null,
256 - "options": {
257 - "browsing": "on",
258 - "depth": "standard",
259 - "max_claims": 5,
260 - "scenarios_per_claim": 2,
261 - "max_evidence_per_scenario": 6,
262 - "context_aware_analysis": true
263 - },
264 - "client": {
265 - "request_id": "optional-client-tracking-id",
266 - "source_label": "optional"
267 - }
579 + "input_type": "url",
580 + "input_url": "https://example.com/medical-report-01",
581 + "input_text": null,
582 + "options": {
583 + "browsing": "on",
584 + "depth": "standard",
585 + "max_claims": 5,
586 + "scenarios_per_claim": 2,
587 + "max_evidence_per_scenario": 6,
588 + "context_aware_analysis": true
589 + },
590 + "client": {
591 + "request_id": "optional-client-tracking-id",
592 + "source_label": "optional"
593 + }
268 268  }
269 269  }}}
270 270  
... ... @@ -280,27 +280,27 @@
280 280  **Response:** 202 Accepted
281 281  
282 282  {{{{
283 - "job_id": "01J...ULID",
284 - "status": "QUEUED",
285 - "created_at": "2025-12-24T10:31:00Z",
286 - "estimated_cost": 0.114,
287 - "cost_breakdown": {
288 - "stage1_extraction": 0.003,
289 - "stage2_new_claims": 0.081,
290 - "stage2_cached_claims": 0.000,
291 - "stage3_holistic": 0.030
292 - },
293 - "cache_info": {
294 - "claims_to_extract": 5,
295 - "estimated_cache_hits": 4,
296 - "estimated_new_claims": 1
297 - },
298 - "links": {
299 - "self": "/v1/jobs/01J...ULID",
300 - "result": "/v1/jobs/01J...ULID/result",
301 - "report": "/v1/jobs/01J...ULID/report",
302 - "events": "/v1/jobs/01J...ULID/events"
303 - }
609 + "job_id": "01J...ULID",
610 + "status": "QUEUED",
611 + "created_at": "2025-12-24T10:31:00Z",
612 + "estimated_cost": 0.114,
613 + "cost_breakdown": {
614 + "stage1_extraction": 0.003,
615 + "stage2_new_claims": 0.081,
616 + "stage2_cached_claims": 0.000,
617 + "stage3_holistic": 0.030
618 + },
619 + "cache_info": {
620 + "claims_to_extract": 5,
621 + "estimated_cache_hits": 4,
622 + "estimated_new_claims": 1
623 + },
624 + "links": {
625 + "self": "/v1/jobs/01J...ULID",
626 + "result": "/v1/jobs/01J...ULID/result",
627 + "report": "/v1/jobs/01J...ULID/report",
628 + "events": "/v1/jobs/01J...ULID/events"
629 + }
304 304  }
305 305  }}}
306 306  
... ... @@ -309,12 +309,12 @@
309 309  402 Payment Required - Free tier limit reached, cache-only mode
310 310  
311 311  {{{{
312 - "error": "credit_limit_reached",
313 - "message": "Monthly credit limit reached. Entering cache-only mode.",
314 - "cache_only_mode": true,
315 - "credit_remaining": 0.00,
316 - "reset_date": "2025-02-01T00:00:00Z",
317 - "action": "Resubmit with cache_preference=allow_partial for cached results"
638 + "error": "credit_limit_reached",
639 + "message": "Monthly credit limit reached. Entering cache-only mode.",
640 + "cache_only_mode": true,
641 + "credit_remaining": 0.00,
642 + "reset_date": "2025-02-01T00:00:00Z",
643 + "action": "Resubmit with cache_preference=allow_partial for cached results"
318 318  }
319 319  }}}
320 320  
... ... @@ -325,29 +325,29 @@
325 325  === 4.1 Stage 1 Output: ClaimExtraction ===
326 326  
327 327  {{{{
328 - "job_id": "01J...ULID",
329 - "stage": "stage1_extraction",
330 - "article_metadata": {
331 - "title": "Article title",
332 - "source_url": "https://example.com/article",
333 - "extracted_text_length": 5234,
334 - "language": "en"
335 - },
336 - "claims": [
337 - {
338 - "claim_id": "C1",
339 - "claim_text": "Original claim text from article",
340 - "canonical_claim": "Normalized, deduplicated phrasing",
341 - "claim_hash": "sha256:abc123...",
342 - "is_central_to_thesis": true,
343 - "claim_type": "causal",
344 - "evaluability": "evaluable",
345 - "risk_tier": "B",
346 - "domain": "public_health"
347 - }
348 - ],
349 - "article_thesis": "Main argument detected",
350 - "cost": 0.003
654 + "job_id": "01J...ULID",
655 + "stage": "stage1_extraction",
656 + "article_metadata": {
657 + "title": "Article title",
658 + "source_url": "https://example.com/article",
659 + "extracted_text_length": 5234,
660 + "language": "en"
661 + },
662 + "claims": [
663 + {
664 + "claim_id": "C1",
665 + "claim_text": "Original claim text from article",
666 + "canonical_claim": "Normalized, deduplicated phrasing",
667 + "claim_hash": "sha256:abc123...",
668 + "is_central_to_thesis": true,
669 + "claim_type": "causal",
670 + "evaluability": "evaluable",
671 + "risk_tier": "B",
672 + "domain": "public_health"
673 + }
674 + ],
675 + "article_thesis": "Main argument detected",
676 + "cost": 0.003
351 351  }
352 352  }}}
353 353  
... ... @@ -433,7 +433,7 @@
433 433  **Data Structure:**
434 434  
435 435  {{{SET claim:v1norm1:en:abc123...def456 '{...ClaimAnalysis JSON...}'
436 -EXPIRE claim:v1norm1:en:abc123...def456 7776000 # 90 days
762 +EXPIRE claim:v1norm1:en:abc123...def456 7776000 # 90 days
437 437  }}}
438 438  
439 439  ----
... ... @@ -445,44 +445,44 @@
445 445  **Algorithm: Canonical Claim Normalization v1**
446 446  
447 447  {{{def normalize_claim_v1(claim_text: str, language: str) -> str:
448 - """
449 - Normalizes claim to canonical form for cache key generation.
450 - Version: v1norm1 (POC1)
451 - """
452 - import re
453 - import unicodedata
454 -
455 - # Step 1: Unicode normalization (NFC)
456 - text = unicodedata.normalize('NFC', claim_text)
457 -
458 - # Step 2: Lowercase
459 - text = text.lower()
460 -
461 - # Step 3: Remove punctuation (except hyphens in words)
462 - text = re.sub(r'[^\w\s-]', '', text)
463 -
464 - # Step 4: Normalize whitespace (collapse multiple spaces)
465 - text = re.sub(r'\s+', ' ', text).strip()
466 -
467 - # Step 5: Numeric normalization
468 - text = text.replace('%', ' percent')
469 - # Spell out single-digit numbers
470 - num_to_word = {'0':'zero', '1':'one', '2':'two', '3':'three',
471 - '4':'four', '5':'five', '6':'six', '7':'seven',
472 - '8':'eight', '9':'nine'}
473 - for num, word in num_to_word.items():
474 - text = re.sub(rf'\b{num}\b', word, text)
475 -
476 - # Step 6: Common abbreviations (English only in v1)
477 - if language == 'en':
478 - text = text.replace('covid-19', 'covid')
479 - text = text.replace('u.s.', 'us')
480 - text = text.replace('u.k.', 'uk')
481 -
482 - # Step 7: NO entity normalization in v1
483 - # (Trump vs Donald Trump vs President Trump remain distinct)
484 -
485 - return text
774 + """
775 + Normalizes claim to canonical form for cache key generation.
776 + Version: v1norm1 (POC1)
777 + """
778 + import re
779 + import unicodedata
780 +
781 + # Step 1: Unicode normalization (NFC)
782 + text = unicodedata.normalize('NFC', claim_text)
783 +
784 + # Step 2: Lowercase
785 + text = text.lower()
786 +
 787 + # Step 3: Remove punctuation (except hyphens in words; keep '%' for Step 5)
 788 + text = re.sub(r'[^\w\s%-]', '', text)
789 +
790 + # Step 4: Normalize whitespace (collapse multiple spaces)
791 + text = re.sub(r'\s+', ' ', text).strip()
792 +
793 + # Step 5: Numeric normalization
794 + text = text.replace('%', ' percent')
795 + # Spell out single-digit numbers
796 + num_to_word = {'0':'zero', '1':'one', '2':'two', '3':'three',
797 + '4':'four', '5':'five', '6':'six', '7':'seven',
798 + '8':'eight', '9':'nine'}
799 + for num, word in num_to_word.items():
800 + text = re.sub(rf'\b{num}\b', word, text)
801 +
802 + # Step 6: Common abbreviations (English only in v1)
803 + if language == 'en':
804 + text = text.replace('covid-19', 'covid')
805 + text = text.replace('u.s.', 'us')
806 + text = text.replace('u.k.', 'uk')
807 +
808 + # Step 7: NO entity normalization in v1
809 + # (Trump vs Donald Trump vs President Trump remain distinct)
810 +
811 + return text
486 486  
487 487  # Version identifier (include in cache namespace)
488 488  CANONICALIZER_VERSION = "v1norm1"
... ... @@ -495,19 +495,19 @@
495 495  cache_key = f"claim:{CANONICALIZER_VERSION}:{language}:{sha256(canonical)}"
496 496  
497 497  Example:
498 - claim: "COVID-19 vaccines are 95% effective"
499 - canonical: "covid vaccines are 95 percent effective"
500 - sha256: abc123...def456
501 - key: "claim:v1norm1:en:abc123...def456"
824 + claim: "COVID-19 vaccines are 95% effective"
825 + canonical: "covid vaccines are 95 percent effective"
826 + sha256: abc123...def456
827 + key: "claim:v1norm1:en:abc123...def456"
502 502  }}}
503 503  
504 504  **Cache Metadata MUST Include:**
505 505  
506 506  {{{{
507 - "canonical_claim": "covid vaccines are 95 percent effective",
508 - "canonicalizer_version": "v1norm1",
509 - "language": "en",
510 - "original_claim_samples": ["COVID-19 vaccines are 95% effective"]
833 + "canonical_claim": "covid vaccines are 95 percent effective",
834 + "canonicalizer_version": "v1norm1",
835 + "language": "en",
836 + "original_claim_samples": ["COVID-19 vaccines are 95% effective"]
511 511  }
512 512  }}}
513 513