Changes for page FactHarbor POC1 Architecture Analysis 1.Jan.26
Last modified by Robert Schaub on 2026/02/08 08:12
From version 7.1
edited by Robert Schaub
on 2026/01/02 10:12
Change comment:
There is no comment for this version
To version 1.1
edited by Robert Schaub
on 2026/01/02 09:59
Change comment:
There is no comment for this version
Summary
-
Page properties (1 modified, 0 added, 0 removed)
Details
- Page properties
-
- Content
-
... ... @@ -1,12 +2,14 @@ 1 -= FactHarbor POC1 Architecture Analysis = 2 2 2 += FactHarbor POC1 Architecture Analysis= 3 + 4 + 3 3 **Version:** 2.6.17 4 4 **Analysis Date:** January 2026 5 5 **Document Purpose:** Technical diagrams, gap analysis, and optimization recommendations 6 6 7 ---- -9 +--- 8 8 9 -== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions) ==11 +== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions)== 10 10 11 11 12 12 {{mermaid}} ... ... @@ -90,123 +90,12 @@ 90 90 class UNDERSTAND,DECIDE,FETCHSRC,EXTRACT,VERDICT,REPORT step 91 91 {{/mermaid}} 92 92 93 ---- -95 +--- 94 94 95 -== 2. ERD Data Model (Current POC1 Implementation) == 96 96 97 - **DataObjectsERD**98 +== 2. ERD Data Model (Current POC1 Implementation)== 98 98 99 -{{mermaid}} 100 -erDiagram 101 - ARTICLE ||--o{ CLAIM : "contains" 102 - ARTICLE ||--|| ARTICLE_VERDICT : "has" 103 - CLAIM ||--|| CLAIM_VERDICT : "has" 104 - CLAIM ||--o{ CLAIM : "depends on" 105 - CLAIM_VERDICT }o--o{ EVIDENCE : "supported by" 106 - SOURCE ||--o{ EVIDENCE : "provides" 107 - ARTICLE ||--o{ SOURCE : "references" 108 108 109 - ARTICLE { 110 - string id PK "Unique identifier (job ID)" 111 - string inputType "text | url" 112 - string inputValue "Original URL or text" 113 - string articleThesis "Main argument/thesis" 114 - string detectedInputType "question | claim | article" 115 - boolean isQuestion "True if input is a question" 116 - datetime createdAt "Analysis timestamp" 117 - datetime updatedAt "Last update" 118 - json distinctProceedings "Legal proceedings if any" 119 - boolean hasMultipleProceedings "Multi-proceeding flag" 120 - string proceedingContext "Context for proceedings" 121 - json logicalFallacies "Detected fallacies array" 122 - boolean isPseudoscience "Pseudoscience detection" 123 - string_array pseudoscienceCategories "Categories if detected" 124 - int llmCalls "Total LLM API calls" 125 - json searchQueries "All search queries performed" 126 - string schemaVersion "e.g. 
2.6.17" 127 - } 128 - 129 - CLAIM { 130 - string id PK "SC1, SC2, C1, etc." 131 - string articleId FK "Parent article" 132 - string text "The claim statement" 133 - string type "legal | procedural | factual | evaluative" 134 - string claimRole "attribution | source | timing | core" 135 - string_array dependsOn "IDs of prerequisite claims" 136 - string_array keyEntities "Named entities in claim" 137 - boolean isCentral "Is this a central claim?" 138 - string relatedProceedingId "Linked proceeding if any" 139 - int startOffset "Position in original text" 140 - int endOffset "End position in original text" 141 - string approximatePosition "Descriptive position" 142 - } 143 - 144 - CLAIM_VERDICT { 145 - string id PK "Same as claim ID" 146 - string claimId FK "Reference to claim" 147 - string llmVerdict "WELL-SUPPORTED | PARTIALLY-SUPPORTED | UNCERTAIN | REFUTED" 148 - string verdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False" 149 - int confidence "0-100 LLM confidence" 150 - int truthPercentage "0-100 calibrated truth score" 151 - string riskTier "A (high) | B (medium) | C (low)" 152 - string reasoning "Explanation of verdict" 153 - string_array supportingFactIds "Evidence IDs supporting this" 154 - boolean dependencyFailed "True if prerequisite failed" 155 - string_array failedDependencies "Which deps failed" 156 - string highlightColor "green | light-green | yellow | orange | dark-orange | red | dark-red" 157 - boolean isPseudoscience "Pseudoscience flag" 158 - string escalationReason "Why verdict was escalated" 159 - } 160 - 161 - ARTICLE_VERDICT { 162 - string id PK "Same as article ID" 163 - string articleId FK "Reference to article" 164 - string llmArticleVerdict "Original LLM verdict" 165 - int llmArticleConfidence "Original LLM confidence" 166 - string articleVerdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False" 167 - int articleTruthPercentage "0-100 calibrated score" 168 - 
string articleVerdictReason "Why verdict differs from claims avg" 169 - int claimsAverageTruthPercentage "Average of claim verdicts" 170 - string claimsAverageVerdict "7-point average verdict" 171 - int claimsTotal "Total claims analyzed" 172 - int claimsSupported "Claims with truth >= 72%" 173 - int claimsUncertain "Claims with truth 43-71%" 174 - int claimsRefuted "Claims with truth < 43%" 175 - int centralClaimsTotal "Number of central claims" 176 - int centralClaimsSupported "Central claims supported" 177 - } 178 - 179 - EVIDENCE { 180 - string id PK "S1-F1, S1-F2 format" 181 - string sourceId FK "Reference to source" 182 - string claimId FK "Optional: specific claim this supports" 183 - string fact "The factual statement extracted" 184 - string category "legal_provision | evidence | expert_quote | statistic | event | criticism" 185 - string specificity "high | medium" 186 - string sourceExcerpt "Original text excerpt" 187 - string relatedProceedingId "Linked proceeding if any" 188 - boolean isContestedClaim "Is this a contested assertion" 189 - string claimSource "Who made contested claim" 190 - } 191 - 192 - SOURCE { 193 - string id PK "S1, S2, etc." 194 - string articleId FK "Parent article" 195 - string url "Full URL" 196 - string title "Page/document title" 197 - string domain "Extracted domain" 198 - int trackRecordScore "0-100 reliability score or null" 199 - string fullText "Extracted content" 200 - datetime fetchedAt "When content was fetched" 201 - string category "news | academic | government | legal" 202 - boolean fetchSuccess "True if fetch succeeded" 203 - string searchQuery "Which query found this" 204 - string mimeType "text/html | application/pdf" 205 - } 206 -{{/mermaid}} 207 - 208 -**Data Usage ERD** 209 - 210 210 {{mermaid}} 211 211 erDiagram 212 212 JOB ||--o{ JOB_EVENT : "has" ... ... @@ -296,10 +296,12 @@ 296 296 } 297 297 {{/mermaid}} 298 298 299 ---- -190 +--- 300 300 301 -== 3. 
Overall Architecture with Interactions == 302 302 193 +== 3. Overall Architecture with Interactions== 194 + 195 + 303 303 {{mermaid}} 304 304 flowchart TB 305 305 subgraph Client["🖥️ Client Layer"] ... ... @@ -393,64 +393,77 @@ 393 393 class ANALYZE_API,JOBS_API,JOB_API,EVENTS_API,RUN_JOB api 394 394 {{/mermaid}} 395 395 396 ---- -289 +--- 397 397 398 -== 4. Specification vs Implementation Gap Analysis == 399 399 400 -== =4.1Data Model Gaps===292 +== 4. Specification vs Implementation Gap Analysis== 401 401 402 -| Specification Entity | POC1 Status | Gap Description | 403 -|-|-|-| 404 -| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` | 405 -| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` | 406 -| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler | 407 -| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts | 408 -| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` | 409 -| **User** | ❌ Missing | No user authentication or role system | 410 -| **Edit** | ❌ Missing | No audit trail for changes | 411 411 295 + 296 +=== 4.1 Data Model Gaps=== 297 + 298 + 299 +| Specification Entity | POC1 Status | Gap Description | 300 +|---------------------|-------------|-----------------| 301 +| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. 
Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` | 302 +| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` | 303 +| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler | 304 +| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts | 305 +| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` | 306 +| **User** | ❌ Missing | No user authentication or role system | 307 +| **Edit** | ❌ Missing | No audit trail for changes | 308 + 309 + 412 412 === 4.2 AKEL Component Gaps === 413 413 414 -| Spec Component | POC1 Status | Gap Description | 415 -| |-|-|416 -| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role | 417 -| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking | 418 -| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification | 419 -| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction | 420 -| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function | 421 -| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search | 422 -| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks | 423 -| **Audit Sampling Scheduler** | ❌ Missing | No audit system | 424 -| **Embedding Handler** | ❌ Missing | Not needed for POC | 425 -| **Federation Sync** | ❌ Missing | Not needed for POC | 312 +| Spec Component | POC1 Status | Gap Description | 313 +|----------------|-------------|-----------------| 314 +| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role | 315 +| **Claim Extractor** | 
✅ Implemented | `understandClaim()` with claim role/dependency tracking | 316 +| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification | 317 +| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction | 318 +| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function | 319 +| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search | 320 +| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks | 321 +| **Audit Sampling Scheduler** | ❌ Missing | No audit system | 322 +| **Embedding Handler** | ❌ Missing | Not needed for POC | 323 +| **Federation Sync** | ❌ Missing | Not needed for POC | 426 426 427 -=== 4.3 Architecture Gaps === 428 428 429 -| Spec Requirement | POC1 Status | Gap Description | 430 -| |-|-| 431 -| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) | 432 -| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover | 433 -| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) | 434 -| **Redis Caching** | ❌ Missing | No caching layer | 435 -| **S3 Archival** | ❌ Missing | No long-term storage | 436 -| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming | 437 -| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection | 326 +=== 4.3 Architecture Gaps=== 438 438 439 -=== 4.4 Publication & Review Gaps === 440 440 441 -| Spec Feature | POC1 Status | Gap Description | 442 -| |-|-| 443 -| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier | 444 -| **Human Review Queue** | ❌ Missing | No review workflow | 445 -| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system | 446 -| **Audit Rate Sampling** | ❌ Missing | No sampling audits | 
329 +| Spec Requirement | POC1 Status | Gap Description | 330 +|------------------|-------------|-----------------| 331 +| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) | 332 +| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover | 333 +| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) | 334 +| **Redis Caching** | ❌ Missing | No caching layer | 335 +| **S3 Archival** | ❌ Missing | No long-term storage | 336 +| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming | 337 +| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection | 447 447 448 ----- 449 449 450 -== 5.Optimization Recommendations==340 +=== 4.4 Publication & Review Gaps=== 451 451 452 -=== 5.1 Cost Optimizations === 453 453 343 +| Spec Feature | POC1 Status | Gap Description | 344 +|--------------|-------------|-----------------| 345 +| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier | 346 +| **Human Review Queue** | ❌ Missing | No review workflow | 347 +| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system | 348 +| **Audit Rate Sampling** | ❌ Missing | No sampling audits | 349 + 350 +--- 351 + 352 + 353 +== 5. Optimization Recommendations== 354 + 355 + 356 + 357 +=== 5.1 Cost Optimizations=== 358 + 359 + 454 454 {{mermaid}} 455 455 pie title Current LLM Cost Distribution (Estimated per Analysis) 456 456 "Step 1: Understand" : 15 ... ... 
@@ -458,16 +458,18 @@ 458 458 "Step 3: Verdicts" : 25 459 459 {{/mermaid}} 460 460 461 -| Optimization | Estimated Savings | Implementation Effort | 462 -| |-||463 -| **Cache claim understanding** | 30-50% on repeated claims | Medium | 464 -| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) | 465 -| **Batch fact extraction** | 20% fewer API calls | Medium | 466 -| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) | 467 -| **Reduce max iterations** | Linear reduction | Low (config change) | 367 +| Optimization | Estimated Savings | Implementation Effort | 368 +|--------------|-------------------|----------------------| 369 +| **Cache claim understanding** | 30-50% on repeated claims | Medium | 370 +| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) | 371 +| **Batch fact extraction** | 20% fewer API calls | Medium | 372 +| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) | 373 +| **Reduce max iterations** | Linear reduction | Low (config change) | 468 468 469 -=== 5.2 Timing Optimizations === 470 470 376 +=== 5.2 Timing Optimizations=== 377 + 378 + 471 471 {{mermaid}} 472 472 gantt 473 473 title Current Analysis Timeline (Typical) ... ... 
@@ -493,16 +493,18 @@ 493 493 Generate Verdicts :b5, after b4, 10s 494 494 {{/mermaid}} 495 495 496 -| Optimization | Time Savings | Notes | 497 -| ||-|498 -| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel | 499 -| **Streaming LLM responses** | 20-30% perceived | User sees progress faster | 500 -| **Search query batching** | 10-15% | Send multiple queries to search API | 501 -| **Reduce prompt size** | 5-10% per call | Optimize system prompts | 502 -| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet | 404 +| Optimization | Time Savings | Notes | 405 +|--------------|--------------|-------| 406 +| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel | 407 +| **Streaming LLM responses** | 20-30% perceived | User sees progress faster | 408 +| **Search query batching** | 10-15% | Send multiple queries to search API | 409 +| **Reduce prompt size** | 5-10% per call | Optimize system prompts | 410 +| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet | 503 503 504 -=== 5.3 Priority Recommendations === 505 505 413 +=== 5.3 Priority Recommendations=== 414 + 415 + 506 506 1. **HIGH PRIORITY - Implement Claim Caching** 507 507 - Cache claim verdicts by content hash 508 508 - Reduces costs for repeated/similar claims ... ... @@ -518,12 +518,16 @@ 518 518 - Cache search results (1h TTL) 519 519 - Reduces external API calls 520 520 521 ---- -431 +--- 522 522 523 -== 6. Separated Verdict Architecture Proposal == 524 524 525 -== =6.1Current Architecture ===434 +== 6. Separated Verdict Architecture Proposal== 526 526 436 + 437 + 438 +=== 6.1 Current Architecture=== 439 + 440 + 527 527 {{mermaid}} 528 528 flowchart LR 529 529 subgraph Current["Current: Monolithic Analysis"] ... ... 
@@ -539,8 +539,10 @@ 539 539 - No caching of individual claim verdicts 540 540 - Article verdict tightly coupled to claim extraction 541 541 542 -=== 6.2 Proposed Separated Architecture === 543 543 457 +=== 6.2 Proposed Separated Architecture=== 458 + 459 + 544 544 {{mermaid}} 545 545 flowchart TB 546 546 subgraph Input["Input Processing"] ... ... @@ -593,25 +593,30 @@ 593 593 class CONTEXT,ARTICLE_VERDICT dynamic 594 594 {{/mermaid}} 595 595 596 -=== 6.3 Benefits Analysis === 597 597 598 -| Benefit | Impact | Rationale | 599 -|-| |-| 600 -| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") | 601 -| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims | 602 -| **Consistency** | High | Same claim always gets same verdict (until cache expires) | 603 -| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence | 604 -| **Scalability** | Linear improvement | More users = higher cache hit rate | 513 +=== 6.3 Benefits Analysis=== 605 605 515 + 516 +| Benefit | Impact | Rationale | 517 +|---------|--------|-----------| 518 +| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") | 519 +| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims | 520 +| **Consistency** | High | Same claim always gets same verdict (until cache expires) | 521 +| **Freshness Control** | Configurable TTL | Balance consistency vs. 
new evidence | 522 +| **Scalability** | Linear improvement | More users = higher cache hit rate | 523 + 524 + 606 606 === 6.4 Implementation Considerations === 607 607 608 608 **Claim Hashing Strategy:** 609 -{{code language="typescript"}}function getClaimHash(claim: string): string { 528 +{{code language="typescript"}} 529 +function getClaimHash(claim: string): string { 610 610 // Normalize: lowercase, remove punctuation, stem words 611 611 const normalized = normalize(claim); 612 612 // Hash for cache key 613 613 return crypto.createHash('sha256').update(normalized).digest('hex').slice(0, 16); 614 -}{{/code}} 534 +} 535 +{{/code}} 615 615 616 616 **Cache Invalidation Triggers:** 617 617 - TTL expiration (default 7 days) ... ... @@ -624,7 +624,7 @@ 624 624 - Same claims in different article contexts may yield different article verdicts 625 625 - Example: "Vaccines are safe" + "Vaccines cause autism" → article may be misleading even if first claim is true 626 626 627 -### 6.5 Recommendation ##548 +### 6.5 Recommendation 628 628 629 629 **YES, separating is beneficial** with the following caveats: 630 630 ... ... @@ -640,19 +640,23 @@ 640 640 - Phase 2: Semantic similarity caching (embedding-based) 641 641 - Phase 3: Federated claim sharing across instances 642 642 643 ---- -564 +--- 644 644 645 -== 7. Summary == 646 646 647 -== =CurrentState===567 +== 7. Summary== 648 648 569 + 570 + 571 +=== Current State=== 572 + 649 649 - POC1 implements core AKEL pipeline successfully 650 650 - Claim dependency tracking is implemented 651 651 - Multiple LLM providers supported 652 652 - No persistent claim storage or caching 653 653 654 -=== Key Gaps from Specification === 655 655 579 +=== Key Gaps from Specification=== 580 + 656 656 - No scenario extraction 657 657 - No user/role system 658 658 - No audit trail ... ... 
@@ -659,10 +659,13 @@ 659 659 - No source track record updates 660 660 - No review queue 661 661 662 -=== Recommended Next Steps === 663 663 588 +=== Recommended Next Steps=== 589 + 664 664 1. Implement claim caching layer 665 665 2. Separate claim vs article verdict generation 666 666 3. Add Redis for source/search caching 667 667 4. Implement tiered model selection 668 668 5. Add basic audit logging 595 + 596 +