Last modified by Robert Schaub on 2025/12/24 18:26

From version 2.2
edited by Robert Schaub
on 2025/12/24 16:28
Change comment: There is no comment for this version
To version 5.2
edited by Robert Schaub
on 2025/12/24 18:26
Change comment: Update document after refactoring.

Summary

Details

Page properties
Parent
... ... @@ -1,1 +1,1 @@
1 -Test.FactHarbor V0\.9\.103.Specification.POC.WebHome
1 +Test.FactHarbor.Specification.POC.WebHome
Content
... ... @@ -41,24 +41,23 @@
41 41  
42 42  FactHarbor POC1 uses a **3-stage architecture** designed for claim-level caching and cost efficiency:
43 43  
44 -{{mermaid}}
45 -graph TD
46 - A[Article Input] --> B[Stage 1: Extract Claims]
47 - B --> C{For Each Claim}
48 - C --> D[Check Cache]
49 - D -->|Cache HIT| E[Return Cached Verdict]
50 - D -->|Cache MISS| F[Stage 2: Analyze Claim]
51 - F --> G[Store in Cache]
52 - G --> E
53 - E --> H[Stage 3: Holistic Assessment]
54 - H --> I[Final Report]
55 -{{/mermaid}}
44 +{{{graph TD
45 + A[Article Input] --> B[Stage 1: Extract Claims]
46 + B --> C{For Each Claim}
47 + C --> D[Check Cache]
48 + D -->|Cache HIT| E[Return Cached Verdict]
49 + D -->|Cache MISS| F[Stage 2: Analyze Claim]
50 + F --> G[Store in Cache]
51 + G --> E
52 + E --> H[Stage 3: Holistic Assessment]
53 + H --> I[Final Report]
54 +}}}
56 56  
57 57  ==== Stage 1: Claim Extraction (Haiku, no cache) ====
58 58  
59 59  * **Input:** Article text
60 60  * **Output:** 5 canonical claims (normalized, deduplicated)
61 -* **Model:** Claude Haiku 4 (default, configurable via LLM abstraction layer)
60 +* **Model:** Claude Haiku 4
62 62  * **Cost:** $0.003 per article
63 63  * **Cache strategy:** No caching (article-specific)
64 64  
... ... @@ -66,7 +66,7 @@
66 66  
67 67  * **Input:** Single canonical claim
68 68  * **Output:** Scenarios + Evidence + Verdicts
69 -* **Model:** Claude Sonnet 3.5 (default, configurable via LLM abstraction layer)
68 +* **Model:** Claude Sonnet 3.5
70 70  * **Cost:** $0.081 per NEW claim
71 71  * **Cache strategy:** Redis, 90-day TTL
72 72  * **Cache key:** claim:v1norm1:{language}:{sha256(canonical_claim)}
... ... @@ -75,14 +75,10 @@
75 75  
76 76  * **Input:** Article + Claim verdicts (from cache or Stage 2)
77 77  * **Output:** Article verdict + Fallacies + Logic quality
78 -* **Model:** Claude Sonnet 3.5 (default, configurable via LLM abstraction layer)
77 +* **Model:** Claude Sonnet 3.5
79 79  * **Cost:** $0.030 per article
80 80  * **Cache strategy:** No caching (article-specific)
81 81  
82 -
83 -
84 -**Note:** Stage 3 implements **Approach 1 (Single-Pass Holistic Analysis)** from the [[Article Verdict Problem>>Test.FactHarbor.Specification.POC.Article-Verdict-Problem]]. While claim analysis (Stage 2) is cached for efficiency, the holistic assessment maintains the integrated evaluation philosophy of Approach 1.
85 -
86 86  === Total Cost Formula: ===
87 87  
88 88  {{{Cost = $0.003 (extraction) + (N_new_claims × $0.081) + $0.030 (holistic)
... ... @@ -150,27 +150,27 @@
150 150  ==== User Experience Example: ====
151 151  
152 152  {{{{
153 - "status": "cache_only_mode",
154 - "message": "Monthly credit limit reached. Showing cached results only.",
155 - "cache_coverage": {
156 - "claims_total": 5,
157 - "claims_cached": 3,
158 - "claims_missing": 2,
159 - "coverage_percent": 60
160 - },
161 - "cached_claims": [
162 - {"claim_id": "C1", "verdict": "Likely", "confidence": 0.82},
163 - {"claim_id": "C2", "verdict": "Highly Likely", "confidence": 0.91},
164 - {"claim_id": "C4", "verdict": "Unclear", "confidence": 0.55}
165 - ],
166 - "missing_claims": [
167 - {"claim_id": "C3", "claim_text": "...", "estimated_cost": "$0.081"},
168 - {"claim_id": "C5", "claim_text": "...", "estimated_cost": "$0.081"}
169 - ],
170 - "upgrade_options": {
171 - "top_up": "$5 for 20-70 more articles",
172 - "pro_tier": "$50/month unlimited"
173 - }
148 + "status": "cache_only_mode",
149 + "message": "Monthly credit limit reached. Showing cached results only.",
150 + "cache_coverage": {
151 + "claims_total": 5,
152 + "claims_cached": 3,
153 + "claims_missing": 2,
154 + "coverage_percent": 60
155 + },
156 + "cached_claims": [
157 + {"claim_id": "C1", "verdict": "Likely", "confidence": 0.82},
158 + {"claim_id": "C2", "verdict": "Highly Likely", "confidence": 0.91},
159 + {"claim_id": "C4", "verdict": "Unclear", "confidence": 0.55}
160 + ],
161 + "missing_claims": [
162 + {"claim_id": "C3", "claim_text": "...", "estimated_cost": "$0.081"},
163 + {"claim_id": "C5", "claim_text": "...", "estimated_cost": "$0.081"}
164 + ],
165 + "upgrade_options": {
166 + "top_up": "$5 for 20-70 more articles",
167 + "pro_tier": "$50/month unlimited"
168 + }
174 174  }
175 175  }}}
176 176  
... ... @@ -183,328 +183,6 @@
183 183  
184 184  ----
185 185  
186 -
187 -
188 -== 6. LLM Abstraction Layer ==
189 -
190 -=== 6.1 Design Principle ===
191 -
192 -**FactHarbor uses provider-agnostic LLM abstraction** to avoid vendor lock-in and enable:
193 -
194 -* **Provider switching:** Change LLM providers without code changes
195 -* **Cost optimization:** Use different providers for different stages
196 -* **Resilience:** Automatic fallback if primary provider fails
197 -* **Cross-checking:** Compare outputs from multiple providers
198 -* **A/B testing:** Test new models without deployment changes
199 -
200 -**Implementation:** All LLM calls go through an abstraction layer that routes to configured providers.
201 -
202 -----
203 -
204 -=== 6.2 LLM Provider Interface ===
205 -
206 -**Abstract Interface:**
207 -
208 -{{{
209 -interface LLMProvider {
210 - // Core methods
211 - complete(prompt: string, options: CompletionOptions): Promise<CompletionResponse>
212 - stream(prompt: string, options: CompletionOptions): AsyncIterator<StreamChunk>
213 -
214 - // Provider metadata
215 - getName(): string
216 - getMaxTokens(): number
217 - getCostPer1kTokens(): { input: number, output: number }
218 -
219 - // Health check
220 - isAvailable(): Promise<boolean>
221 -}
222 -
223 -interface CompletionOptions {
224 - model?: string
225 - maxTokens?: number
226 - temperature?: number
227 - stopSequences?: string[]
228 - systemPrompt?: string
229 -}
230 -}}}
231 -
232 -----
233 -
234 -=== 6.3 Supported Providers (POC1) ===
235 -
236 -**Primary Provider (Default):**
237 -
238 -* **Anthropic Claude API**
239 - * Models: Claude Haiku 4, Claude Sonnet 3.5, Claude Opus 4
240 - * Used by default in POC1
241 - * Best quality for holistic analysis
242 -
243 -**Secondary Providers (Future):**
244 -
245 -* **OpenAI API**
246 - * Models: GPT-4o, GPT-4o-mini
247 - * For cost comparison
248 -
249 -* **Google Vertex AI**
250 - * Models: Gemini 1.5 Pro, Gemini 1.5 Flash
251 - * For diversity in evidence gathering
252 -
253 -* **Local Models** (Post-POC)
254 - * Models: Llama 3.1, Mistral
255 - * For privacy-sensitive deployments
256 -
257 -----
258 -
259 -=== 6.4 Provider Configuration ===
260 -
261 -**Environment Variables:**
262 -
263 -{{{
264 -# Primary provider
265 -LLM_PRIMARY_PROVIDER=anthropic
266 -ANTHROPIC_API_KEY=sk-ant-...
267 -
268 -# Fallback provider
269 -LLM_FALLBACK_PROVIDER=openai
270 -OPENAI_API_KEY=sk-...
271 -
272 -# Provider selection per stage
273 -LLM_STAGE1_PROVIDER=anthropic
274 -LLM_STAGE1_MODEL=claude-haiku-4
275 -LLM_STAGE2_PROVIDER=anthropic
276 -LLM_STAGE2_MODEL=claude-sonnet-3-5
277 -LLM_STAGE3_PROVIDER=anthropic
278 -LLM_STAGE3_MODEL=claude-sonnet-3-5
279 -
280 -# Cost limits
281 -LLM_MAX_COST_PER_REQUEST=1.00
282 -}}}
283 -
284 -**Database Configuration (Alternative):**
285 -
286 -{{{{
287 -{
288 - "providers": [
289 - {
290 - "name": "anthropic",
291 - "api_key_ref": "vault://anthropic-api-key",
292 - "enabled": true,
293 - "priority": 1
294 - },
295 - {
296 - "name": "openai",
297 - "api_key_ref": "vault://openai-api-key",
298 - "enabled": true,
299 - "priority": 2
300 - }
301 - ],
302 - "stage_config": {
303 - "stage1": {
304 - "provider": "anthropic",
305 - "model": "claude-haiku-4",
306 - "max_tokens": 4096,
307 - "temperature": 0.0
308 - },
309 - "stage2": {
310 - "provider": "anthropic",
311 - "model": "claude-sonnet-3-5",
312 - "max_tokens": 16384,
313 - "temperature": 0.3
314 - },
315 - "stage3": {
316 - "provider": "anthropic",
317 - "model": "claude-sonnet-3-5",
318 - "max_tokens": 8192,
319 - "temperature": 0.2
320 - }
321 - }
322 -}
323 -}}}
324 -
325 -----
326 -
327 -=== 6.5 Stage-Specific Models (POC1 Defaults) ===
328 -
329 -**Stage 1: Claim Extraction**
330 -
331 -* **Default:** Anthropic Claude Haiku 4
332 -* **Alternative:** OpenAI GPT-4o-mini, Google Gemini 1.5 Flash
333 -* **Rationale:** Fast, cheap, simple task
334 -* **Cost:** ~$0.003 per article
335 -
336 -**Stage 2: Claim Analysis** (CACHEABLE)
337 -
338 -* **Default:** Anthropic Claude Sonnet 3.5
339 -* **Alternative:** OpenAI GPT-4o, Google Gemini 1.5 Pro
340 -* **Rationale:** High-quality analysis, cached 90 days
341 -* **Cost:** ~$0.081 per NEW claim
342 -
343 -**Stage 3: Holistic Assessment**
344 -
345 -* **Default:** Anthropic Claude Sonnet 3.5
346 -* **Alternative:** OpenAI GPT-4o, Claude Opus 4 (for high-stakes)
347 -* **Rationale:** Complex reasoning, logical fallacy detection
348 -* **Cost:** ~$0.030 per article
349 -
350 -**Cost Comparison (Example):**
351 -
352 -|=Stage|=Anthropic (Default)|=OpenAI Alternative|=Google Alternative
353 -|Stage 1|Claude Haiku 4 ($0.003)|GPT-4o-mini ($0.002)|Gemini Flash ($0.002)
354 -|Stage 2|Claude Sonnet 3.5 ($0.081)|GPT-4o ($0.045)|Gemini Pro ($0.050)
355 -|Stage 3|Claude Sonnet 3.5 ($0.030)|GPT-4o ($0.018)|Gemini Pro ($0.020)
356 -|**Total (0% cache)**|**$0.114**|**$0.065**|**$0.072**
357 -
358 -**Note:** POC1 uses Anthropic exclusively for consistency. Multi-provider support planned for POC2.
359 -
360 -----
361 -
362 -=== 6.6 Failover Strategy ===
363 -
364 -**Automatic Failover:**
365 -
366 -{{{
367 -async function completeLLM(stage: string, prompt: string): Promise<string> {
368 - const primaryProvider = getProviderForStage(stage)
369 - const fallbackProvider = getFallbackProvider()
370 -
371 - try {
372 - return await primaryProvider.complete(prompt)
373 - } catch (error) {
374 - if (error.type === 'rate_limit' || error.type === 'service_unavailable') {
375 - logger.warn(`Primary provider failed, using fallback`)
376 - return await fallbackProvider.complete(prompt)
377 - }
378 - throw error
379 - }
380 -}
381 -}}}
382 -
383 -**Fallback Priority:**
384 -
385 -1. **Primary:** Configured provider for stage
386 -2. **Secondary:** Fallback provider (if configured)
387 -3. **Cache:** Return cached result (if available for Stage 2)
388 -4. **Error:** Return 503 Service Unavailable
389 -
390 -----
391 -
392 -=== 6.7 Provider Selection API ===
393 -
394 -**Admin Endpoint:** POST /admin/v1/llm/configure
395 -
396 -**Update provider for specific stage:**
397 -
398 -{{{{
399 -{
400 - "stage": "stage2",
401 - "provider": "openai",
402 - "model": "gpt-4o",
403 - "max_tokens": 16384,
404 - "temperature": 0.3
405 -}
406 -}}}
407 -
408 -**Response:** 200 OK
409 -
410 -{{{{
411 -{
412 - "message": "LLM configuration updated",
413 - "stage": "stage2",
414 - "previous": {
415 - "provider": "anthropic",
416 - "model": "claude-sonnet-3-5"
417 - },
418 - "current": {
419 - "provider": "openai",
420 - "model": "gpt-4o"
421 - },
422 - "cost_impact": {
423 - "previous_cost_per_claim": 0.081,
424 - "new_cost_per_claim": 0.045,
425 - "savings_percent": 44
426 - }
427 -}
428 -}}}
429 -
430 -**Get current configuration:**
431 -
432 -GET /admin/v1/llm/config
433 -
434 -{{{{
435 -{
436 - "providers": ["anthropic", "openai"],
437 - "primary": "anthropic",
438 - "fallback": "openai",
439 - "stages": {
440 - "stage1": {
441 - "provider": "anthropic",
442 - "model": "claude-haiku-4",
443 - "cost_per_request": 0.003
444 - },
445 - "stage2": {
446 - "provider": "anthropic",
447 - "model": "claude-sonnet-3-5",
448 - "cost_per_new_claim": 0.081
449 - },
450 - "stage3": {
451 - "provider": "anthropic",
452 - "model": "claude-sonnet-3-5",
453 - "cost_per_request": 0.030
454 - }
455 - }
456 -}
457 -}}}
458 -
459 -----
460 -
461 -=== 6.8 Implementation Notes ===
462 -
463 -**Provider Adapter Pattern:**
464 -
465 -{{{
466 -class AnthropicProvider implements LLMProvider {
467 - async complete(prompt: string, options: CompletionOptions) {
468 - const response = await anthropic.messages.create({
469 - model: options.model || 'claude-sonnet-3-5',
470 - max_tokens: options.maxTokens || 4096,
471 - messages: [{ role: 'user', content: prompt }],
472 - system: options.systemPrompt
473 - })
474 - return response.content[0].text
475 - }
476 -}
477 -
478 -class OpenAIProvider implements LLMProvider {
479 - async complete(prompt: string, options: CompletionOptions) {
480 - const response = await openai.chat.completions.create({
481 - model: options.model || 'gpt-4o',
482 - max_tokens: options.maxTokens || 4096,
483 - messages: [
484 - { role: 'system', content: options.systemPrompt },
485 - { role: 'user', content: prompt }
486 - ]
487 - })
488 - return response.choices[0].message.content
489 - }
490 -}
491 -}}}
492 -
493 -**Provider Registry:**
494 -
495 -{{{
496 -const providers = new Map<string, LLMProvider>()
497 -providers.set('anthropic', new AnthropicProvider())
498 -providers.set('openai', new OpenAIProvider())
499 -providers.set('google', new GoogleProvider())
500 -
501 -function getProvider(name: string): LLMProvider {
502 - return providers.get(name) || providers.get(config.primaryProvider)
503 -}
504 -}}}
505 -
506 -----
507 -
508 508  == 3. REST API Contract ==
509 509  
510 510  === 3.1 User Credit Tracking ===
... ... @@ -514,19 +514,19 @@
514 514  **Response:** 200 OK
515 515  
516 516  {{{{
517 - "user_id": "user_abc123",
518 - "tier": "free",
519 - "credit_limit": 10.00,
520 - "credit_used": 7.42,
521 - "credit_remaining": 2.58,
522 - "reset_date": "2025-02-01T00:00:00Z",
523 - "cache_only_mode": false,
524 - "usage_stats": {
525 - "articles_analyzed": 67,
526 - "claims_from_cache": 189,
527 - "claims_newly_analyzed": 113,
528 - "cache_hit_rate": 0.626
529 - }
190 + "user_id": "user_abc123",
191 + "tier": "free",
192 + "credit_limit": 10.00,
193 + "credit_used": 7.42,
194 + "credit_remaining": 2.58,
195 + "reset_date": "2025-02-01T00:00:00Z",
196 + "cache_only_mode": false,
197 + "usage_stats": {
198 + "articles_analyzed": 67,
199 + "claims_from_cache": 189,
200 + "claims_newly_analyzed": 113,
201 + "cache_hit_rate": 0.626
202 + }
530 530  }
531 531  }}}
532 532  
... ... @@ -547,11 +547,11 @@
547 547  OR use the client.request_id field:
548 548  
549 549  {{{{
550 - "input_url": "...",
551 - "client": {
552 - "request_id": "client-uuid-12345",
553 - "source_label": "optional"
554 - }
223 + "input_url": "...",
224 + "client": {
225 + "request_id": "client-uuid-12345",
226 + "source_label": "optional"
227 + }
555 555  }
556 556  }}}
557 557  
... ... @@ -565,11 +565,11 @@
565 565  **Example Response (Idempotent):**
566 566  
567 567  {{{{
568 - "job_id": "01J...ULID",
569 - "status": "RUNNING",
570 - "idempotent": true,
571 - "original_request_at": "2025-12-24T10:31:00Z",
572 - "message": "Returning existing job (idempotency key matched)"
241 + "job_id": "01J...ULID",
242 + "status": "RUNNING",
243 + "idempotent": true,
244 + "original_request_at": "2025-12-24T10:31:00Z",
245 + "message": "Returning existing job (idempotency key matched)"
573 573  }
574 574  }}}
575 575  
... ... @@ -576,21 +576,21 @@
576 576  ==== Request Body: ====
577 577  
578 578  {{{{
579 - "input_type": "url",
580 - "input_url": "https://example.com/medical-report-01",
581 - "input_text": null,
582 - "options": {
583 - "browsing": "on",
584 - "depth": "standard",
585 - "max_claims": 5,
586 - "scenarios_per_claim": 2,
587 - "max_evidence_per_scenario": 6,
588 - "context_aware_analysis": true
589 - },
590 - "client": {
591 - "request_id": "optional-client-tracking-id",
592 - "source_label": "optional"
593 - }
252 + "input_type": "url",
253 + "input_url": "https://example.com/medical-report-01",
254 + "input_text": null,
255 + "options": {
256 + "browsing": "on",
257 + "depth": "standard",
258 + "max_claims": 5,
259 + "scenarios_per_claim": 2,
260 + "max_evidence_per_scenario": 6,
261 + "context_aware_analysis": true
262 + },
263 + "client": {
264 + "request_id": "optional-client-tracking-id",
265 + "source_label": "optional"
266 + }
594 594  }
595 595  }}}
596 596  
... ... @@ -606,27 +606,27 @@
606 606  **Response:** 202 Accepted
607 607  
608 608  {{{{
609 - "job_id": "01J...ULID",
610 - "status": "QUEUED",
611 - "created_at": "2025-12-24T10:31:00Z",
612 - "estimated_cost": 0.114,
613 - "cost_breakdown": {
614 - "stage1_extraction": 0.003,
615 - "stage2_new_claims": 0.081,
616 - "stage2_cached_claims": 0.000,
617 - "stage3_holistic": 0.030
618 - },
619 - "cache_info": {
620 - "claims_to_extract": 5,
621 - "estimated_cache_hits": 4,
622 - "estimated_new_claims": 1
623 - },
624 - "links": {
625 - "self": "/v1/jobs/01J...ULID",
626 - "result": "/v1/jobs/01J...ULID/result",
627 - "report": "/v1/jobs/01J...ULID/report",
628 - "events": "/v1/jobs/01J...ULID/events"
629 - }
282 + "job_id": "01J...ULID",
283 + "status": "QUEUED",
284 + "created_at": "2025-12-24T10:31:00Z",
285 + "estimated_cost": 0.114,
286 + "cost_breakdown": {
287 + "stage1_extraction": 0.003,
288 + "stage2_new_claims": 0.081,
289 + "stage2_cached_claims": 0.000,
290 + "stage3_holistic": 0.030
291 + },
292 + "cache_info": {
293 + "claims_to_extract": 5,
294 + "estimated_cache_hits": 4,
295 + "estimated_new_claims": 1
296 + },
297 + "links": {
298 + "self": "/v1/jobs/01J...ULID",
299 + "result": "/v1/jobs/01J...ULID/result",
300 + "report": "/v1/jobs/01J...ULID/report",
301 + "events": "/v1/jobs/01J...ULID/events"
302 + }
630 630  }
631 631  }}}
632 632  
... ... @@ -635,12 +635,12 @@
635 635  402 Payment Required - Free tier limit reached, cache-only mode
636 636  
637 637  {{{{
638 - "error": "credit_limit_reached",
639 - "message": "Monthly credit limit reached. Entering cache-only mode.",
640 - "cache_only_mode": true,
641 - "credit_remaining": 0.00,
642 - "reset_date": "2025-02-01T00:00:00Z",
643 - "action": "Resubmit with cache_preference=allow_partial for cached results"
311 + "error": "credit_limit_reached",
312 + "message": "Monthly credit limit reached. Entering cache-only mode.",
313 + "cache_only_mode": true,
314 + "credit_remaining": 0.00,
315 + "reset_date": "2025-02-01T00:00:00Z",
316 + "action": "Resubmit with cache_preference=allow_partial for cached results"
644 644  }
645 645  }}}
646 646  
... ... @@ -651,29 +651,29 @@
651 651  === 4.1 Stage 1 Output: ClaimExtraction ===
652 652  
653 653  {{{{
654 - "job_id": "01J...ULID",
655 - "stage": "stage1_extraction",
656 - "article_metadata": {
657 - "title": "Article title",
658 - "source_url": "https://example.com/article",
659 - "extracted_text_length": 5234,
660 - "language": "en"
661 - },
662 - "claims": [
663 - {
664 - "claim_id": "C1",
665 - "claim_text": "Original claim text from article",
666 - "canonical_claim": "Normalized, deduplicated phrasing",
667 - "claim_hash": "sha256:abc123...",
668 - "is_central_to_thesis": true,
669 - "claim_type": "causal",
670 - "evaluability": "evaluable",
671 - "risk_tier": "B",
672 - "domain": "public_health"
673 - }
674 - ],
675 - "article_thesis": "Main argument detected",
676 - "cost": 0.003
327 + "job_id": "01J...ULID",
328 + "stage": "stage1_extraction",
329 + "article_metadata": {
330 + "title": "Article title",
331 + "source_url": "https://example.com/article",
332 + "extracted_text_length": 5234,
333 + "language": "en"
334 + },
335 + "claims": [
336 + {
337 + "claim_id": "C1",
338 + "claim_text": "Original claim text from article",
339 + "canonical_claim": "Normalized, deduplicated phrasing",
340 + "claim_hash": "sha256:abc123...",
341 + "is_central_to_thesis": true,
342 + "claim_type": "causal",
343 + "evaluability": "evaluable",
344 + "risk_tier": "B",
345 + "domain": "public_health"
346 + }
347 + ],
348 + "article_thesis": "Main argument detected",
349 + "cost": 0.003
677 677  }
678 678  }}}
679 679  
... ... @@ -759,7 +759,7 @@
759 759  **Data Structure:**
760 760  
761 761  {{{SET claim:v1norm1:en:abc123...def456 '{...ClaimAnalysis JSON...}'
762 -EXPIRE claim:v1norm1:en:abc123...def456 7776000 # 90 days
435 +EXPIRE claim:v1norm1:en:abc123...def456 7776000 # 90 days
763 763  }}}
764 764  
765 765  ----
... ... @@ -771,44 +771,44 @@
771 771  **Algorithm: Canonical Claim Normalization v1**
772 772  
773 773  {{{def normalize_claim_v1(claim_text: str, language: str) -> str:
774 - """
775 - Normalizes claim to canonical form for cache key generation.
776 - Version: v1norm1 (POC1)
777 - """
778 - import re
779 - import unicodedata
780 -
781 - # Step 1: Unicode normalization (NFC)
782 - text = unicodedata.normalize('NFC', claim_text)
783 -
784 - # Step 2: Lowercase
785 - text = text.lower()
786 -
787 - # Step 3: Remove punctuation (except hyphens in words)
788 - text = re.sub(r'[^\w\s-]', '', text)
789 -
790 - # Step 4: Normalize whitespace (collapse multiple spaces)
791 - text = re.sub(r'\s+', ' ', text).strip()
792 -
793 - # Step 5: Numeric normalization
794 - text = text.replace('%', ' percent')
795 - # Spell out single-digit numbers
796 - num_to_word = {'0':'zero', '1':'one', '2':'two', '3':'three',
797 - '4':'four', '5':'five', '6':'six', '7':'seven',
798 - '8':'eight', '9':'nine'}
799 - for num, word in num_to_word.items():
800 - text = re.sub(rf'\b{num}\b', word, text)
801 -
802 - # Step 6: Common abbreviations (English only in v1)
803 - if language == 'en':
804 - text = text.replace('covid-19', 'covid')
805 - text = text.replace('u.s.', 'us')
806 - text = text.replace('u.k.', 'uk')
807 -
808 - # Step 7: NO entity normalization in v1
809 - # (Trump vs Donald Trump vs President Trump remain distinct)
810 -
811 - return text
447 + """
448 + Normalizes claim to canonical form for cache key generation.
449 + Version: v1norm1 (POC1)
450 + """
451 + import re
452 + import unicodedata
453 +
454 + # Step 1: Unicode normalization (NFC)
455 + text = unicodedata.normalize('NFC', claim_text)
456 +
457 + # Step 2: Lowercase
458 + text = text.lower()
459 +
460 + # Step 3: Remove punctuation (except hyphens in words)
461 + text = re.sub(r'[^\w\s-]', '', text)
462 +
463 + # Step 4: Normalize whitespace (collapse multiple spaces)
464 + text = re.sub(r'\s+', ' ', text).strip()
465 +
466 + # Step 5: Numeric normalization
467 + text = text.replace('%', ' percent')
468 + # Spell out single-digit numbers
469 + num_to_word = {'0':'zero', '1':'one', '2':'two', '3':'three',
470 + '4':'four', '5':'five', '6':'six', '7':'seven',
471 + '8':'eight', '9':'nine'}
472 + for num, word in num_to_word.items():
473 + text = re.sub(rf'\b{num}\b', word, text)
474 +
475 + # Step 6: Common abbreviations (English only in v1)
476 + if language == 'en':
477 + text = text.replace('covid-19', 'covid')
478 + text = text.replace('u.s.', 'us')
479 + text = text.replace('u.k.', 'uk')
480 +
481 + # Step 7: NO entity normalization in v1
482 + # (Trump vs Donald Trump vs President Trump remain distinct)
483 +
484 + return text
812 812  
813 813  # Version identifier (include in cache namespace)
814 814  CANONICALIZER_VERSION = "v1norm1"
... ... @@ -821,19 +821,19 @@
821 821  cache_key = f"claim:{CANONICALIZER_VERSION}:{language}:{sha256(canonical)}"
822 822  
823 823  Example:
824 - claim: "COVID-19 vaccines are 95% effective"
825 - canonical: "covid vaccines are 95 percent effective"
826 - sha256: abc123...def456
827 - key: "claim:v1norm1:en:abc123...def456"
497 + claim: "COVID-19 vaccines are 95% effective"
498 + canonical: "covid vaccines are 95 percent effective"
499 + sha256: abc123...def456
500 + key: "claim:v1norm1:en:abc123...def456"
828 828  }}}
829 829  
830 830  **Cache Metadata MUST Include:**
831 831  
832 832  {{{{
833 - "canonical_claim": "covid vaccines are 95 percent effective",
834 - "canonicalizer_version": "v1norm1",
835 - "language": "en",
836 - "original_claim_samples": ["COVID-19 vaccines are 95% effective"]
506 + "canonical_claim": "covid vaccines are 95 percent effective",
507 + "canonicalizer_version": "v1norm1",
508 + "language": "en",
509 + "original_claim_samples": ["COVID-19 vaccines are 95% effective"]
837 837  }
838 838  }}}
839 839