Changes for page FactHarbor POC1 Architecture Analysis 1.Jan.26
Last modified by Robert Schaub on 2026/02/08 08:12
From version 1.1
edited by Robert Schaub
on 2026/01/02 09:59
Change comment:
There is no comment for this version
To version 11.1
edited by Robert Schaub
on 2026/01/02 10:14
Change comment:
There is no comment for this version
Summary
-
Page properties (2 modified, 0 added, 0 removed)
Details
- Page properties
-
- Title
-
... ... @@ -1,1 +1,1 @@ 1 -FactHarbor POC1 Architecture Analysis 1 +FactHarbor POC1 Architecture Analysis 1.Jan.26 - Content
-
... ... @@ -1,14 +1,12 @@ 1 += FactHarbor POC1 Architecture Analysis = 1 1 2 -= FactHarbor POC1 Architecture Analysis= 3 - 4 - 5 5 **Version:** 2.6.17 6 6 **Analysis Date:** January 2026 7 7 **Document Purpose:** Technical diagrams, gap analysis, and optimization recommendations 8 8 9 ---- 7 +---- 10 10 11 -== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions)== 9 +== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions) == 12 12 13 13 14 14 {{mermaid}} ... ... @@ -92,12 +92,123 @@ 92 92 class UNDERSTAND,DECIDE,FETCHSRC,EXTRACT,VERDICT,REPORT step 93 93 {{/mermaid}} 94 94 95 ---- 93 +---- 96 96 95 +== 2. ERD Data Model (Current POC1 Implementation) == 97 97 98 - == 2. ERDDataModel (Current POC1 Implementation)==97 +**Data Objects ERD** 99 99 99 +{{mermaid}} 100 +erDiagram 101 + ARTICLE ||--o{ CLAIM : "contains" 102 + ARTICLE ||--|| ARTICLE_VERDICT : "has" 103 + CLAIM ||--|| CLAIM_VERDICT : "has" 104 + CLAIM ||--o{ CLAIM : "depends on" 105 + CLAIM_VERDICT }o--o{ EVIDENCE : "supported by" 106 + SOURCE ||--o{ EVIDENCE : "provides" 107 + ARTICLE ||--o{ SOURCE : "references" 100 100 109 + ARTICLE { 110 + string id PK "Unique identifier (job ID)" 111 + string inputType "text | url" 112 + string inputValue "Original URL or text" 113 + string articleThesis "Main argument/thesis" 114 + string detectedInputType "question | claim | article" 115 + boolean isQuestion "True if input is a question" 116 + datetime createdAt "Analysis timestamp" 117 + datetime updatedAt "Last update" 118 + json distinctProceedings "Legal proceedings if any" 119 + boolean hasMultipleProceedings "Multi-proceeding flag" 120 + string proceedingContext "Context for proceedings" 121 + json logicalFallacies "Detected fallacies array" 122 + boolean isPseudoscience "Pseudoscience detection" 123 + string_array pseudoscienceCategories "Categories if detected" 124 + int llmCalls "Total LLM API calls" 125 + json searchQueries "All search queries performed" 126 + string schemaVersion "e.g. 
2.6.17" 127 + } 128 + 129 + CLAIM { 130 + string id PK "SC1, SC2, C1, etc." 131 + string articleId FK "Parent article" 132 + string text "The claim statement" 133 + string type "legal | procedural | factual | evaluative" 134 + string claimRole "attribution | source | timing | core" 135 + string_array dependsOn "IDs of prerequisite claims" 136 + string_array keyEntities "Named entities in claim" 137 + boolean isCentral "Is this a central claim?" 138 + string relatedProceedingId "Linked proceeding if any" 139 + int startOffset "Position in original text" 140 + int endOffset "End position in original text" 141 + string approximatePosition "Descriptive position" 142 + } 143 + 144 + CLAIM_VERDICT { 145 + string id PK "Same as claim ID" 146 + string claimId FK "Reference to claim" 147 + string llmVerdict "WELL-SUPPORTED | PARTIALLY-SUPPORTED | UNCERTAIN | REFUTED" 148 + string verdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False" 149 + int confidence "0-100 LLM confidence" 150 + int truthPercentage "0-100 calibrated truth score" 151 + string riskTier "A (high) | B (medium) | C (low)" 152 + string reasoning "Explanation of verdict" 153 + string_array supportingFactIds "Evidence IDs supporting this" 154 + boolean dependencyFailed "True if prerequisite failed" 155 + string_array failedDependencies "Which deps failed" 156 + string highlightColor "green | light-green | yellow | orange | dark-orange | red | dark-red" 157 + boolean isPseudoscience "Pseudoscience flag" 158 + string escalationReason "Why verdict was escalated" 159 + } 160 + 161 + ARTICLE_VERDICT { 162 + string id PK "Same as article ID" 163 + string articleId FK "Reference to article" 164 + string llmArticleVerdict "Original LLM verdict" 165 + int llmArticleConfidence "Original LLM confidence" 166 + string articleVerdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False" 167 + int articleTruthPercentage "0-100 calibrated score" 168 + 
string articleVerdictReason "Why verdict differs from claims avg" 169 + int claimsAverageTruthPercentage "Average of claim verdicts" 170 + string claimsAverageVerdict "7-point average verdict" 171 + int claimsTotal "Total claims analyzed" 172 + int claimsSupported "Claims with truth >= 72%" 173 + int claimsUncertain "Claims with truth 43-71%" 174 + int claimsRefuted "Claims with truth < 43%" 175 + int centralClaimsTotal "Number of central claims" 176 + int centralClaimsSupported "Central claims supported" 177 + } 178 + 179 + EVIDENCE { 180 + string id PK "S1-F1, S1-F2 format" 181 + string sourceId FK "Reference to source" 182 + string claimId FK "Optional: specific claim this supports" 183 + string fact "The factual statement extracted" 184 + string category "legal_provision | evidence | expert_quote | statistic | event | criticism" 185 + string specificity "high | medium" 186 + string sourceExcerpt "Original text excerpt" 187 + string relatedProceedingId "Linked proceeding if any" 188 + boolean isContestedClaim "Is this a contested assertion" 189 + string claimSource "Who made contested claim" 190 + } 191 + 192 + SOURCE { 193 + string id PK "S1, S2, etc." 194 + string articleId FK "Parent article" 195 + string url "Full URL" 196 + string title "Page/document title" 197 + string domain "Extracted domain" 198 + int trackRecordScore "0-100 reliability score or null" 199 + string fullText "Extracted content" 200 + datetime fetchedAt "When content was fetched" 201 + string category "news | academic | government | legal" 202 + boolean fetchSuccess "True if fetch succeeded" 203 + string searchQuery "Which query found this" 204 + string mimeType "text/html | application/pdf" 205 + } 206 +{{/mermaid}} 207 + 208 +**Data Usage ERD** 209 + 101 101 {{mermaid}} 102 102 erDiagram 103 103 JOB ||--o{ JOB_EVENT : "has" ... ... @@ -187,12 +187,10 @@ 187 187 } 188 188 {{/mermaid}} 189 189 190 ---- 299 +---- 191 191 301 +== 3. 
Overall Architecture with Interactions == 192 192 193 -== 3. Overall Architecture with Interactions== 194 - 195 - 196 196 {{mermaid}} 197 197 flowchart TB 198 198 subgraph Client["🖥️ Client Layer"] ... ... @@ -286,77 +286,64 @@ 286 286 class ANALYZE_API,JOBS_API,JOB_API,EVENTS_API,RUN_JOB api 287 287 {{/mermaid}} 288 288 289 ---- 396 +---- 290 290 398 +== 4. Specification vs Implementation Gap Analysis == 291 291 292 -== 4. Specification vs ImplementationGapAnalysis==400 +=== 4.1 Data Model Gaps === 293 293 402 +| Specification Entity | POC1 Status | Gap Description | 403 +|-|-|-| 404 +| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` | 405 +| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` | 406 +| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler | 407 +| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts | 408 +| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` | 409 +| **User** | ❌ Missing | No user authentication or role system | 410 +| **Edit** | ❌ Missing | No audit trail for changes | 294 294 295 - 296 -=== 4.1 Data Model Gaps=== 297 - 298 - 299 -| Specification Entity | POC1 Status | Gap Description | 300 -|---------------------|-------------|-----------------| 301 -| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. 
Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` | 302 -| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` | 303 -| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler | 304 -| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts | 305 -| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` | 306 -| **User** | ❌ Missing | No user authentication or role system | 307 -| **Edit** | ❌ Missing | No audit trail for changes | 308 - 309 - 310 310 === 4.2 AKEL Component Gaps === 311 311 312 -| Spec Component | POC1 Status | Gap Description | 313 -| ----------------|-------------|-----------------|314 -| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role | 315 -| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking | 316 -| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification | 317 -| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction | 318 -| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function | 319 -| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search | 320 -| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks | 321 -| **Audit Sampling Scheduler** | ❌ Missing | No audit system | 322 -| **Embedding Handler** | ❌ Missing | Not needed for POC | 323 -| **Federation Sync** | ❌ Missing | Not needed for POC | 414 +| Spec Component | POC1 Status | Gap Description | 415 +| |-|-| 416 +| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role | 417 +| **Claim Extractor** 
| ✅ Implemented | `understandClaim()` with claim role/dependency tracking | 418 +| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification | 419 +| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction | 420 +| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function | 421 +| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search | 422 +| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks | 423 +| **Audit Sampling Scheduler** | ❌ Missing | No audit system | 424 +| **Embedding Handler** | ❌ Missing | Not needed for POC | 425 +| **Federation Sync** | ❌ Missing | Not needed for POC | 324 324 427 +=== 4.3 Architecture Gaps === 325 325 326 -=== 4.3 Architecture Gaps=== 429 +| Spec Requirement | POC1 Status | Gap Description | 430 +| |-|-| 431 +| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) | 432 +| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover | 433 +| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) | 434 +| **Redis Caching** | ❌ Missing | No caching layer | 435 +| **S3 Archival** | ❌ Missing | No long-term storage | 436 +| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming | 437 +| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection | 327 327 439 +=== 4.4 Publication & Review Gaps === 328 328 329 -| Spec Requirement | POC1 Status | Gap Description | 330 -|------------------|-------------|-----------------| 331 -| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) | 332 -| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover | 333 -| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity 
(acceptable for POC) | 334 -| **Redis Caching** | ❌ Missing | No caching layer | 335 -| **S3 Archival** | ❌ Missing | No long-term storage | 336 -| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming | 337 -| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection | 441 +| Spec Feature | POC1 Status | Gap Description | 442 +| |-|-| 443 +| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier | 444 +| **Human Review Queue** | ❌ Missing | No review workflow | 445 +| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system | 446 +| **Audit Rate Sampling** | ❌ Missing | No sampling audits | 338 338 448 +---- 339 339 340 -=== 4.4 Publication & Review Gaps=== 450 +== 5. Optimization Recommendations == 341 341 452 +=== 5.1 Cost Optimizations === 342 342 343 -| Spec Feature | POC1 Status | Gap Description | 344 -|--------------|-------------|-----------------| 345 -| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier | 346 -| **Human Review Queue** | ❌ Missing | No review workflow | 347 -| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system | 348 -| **Audit Rate Sampling** | ❌ Missing | No sampling audits | 349 - 350 ---- 351 - 352 - 353 -== 5. Optimization Recommendations== 354 - 355 - 356 - 357 -=== 5.1 Cost Optimizations=== 358 - 359 - 360 360 {{mermaid}} 361 361 pie title Current LLM Cost Distribution (Estimated per Analysis) 362 362 "Step 1: Understand" : 15 ... ... 
@@ -364,18 +364,16 @@ 364 364 "Step 3: Verdicts" : 25 365 365 {{/mermaid}} 366 366 367 -| Optimization | Estimated Savings | Implementation Effort | 368 -|--------------|-------------------|----------------------| 369 -| **Cache claim understanding** | 30-50% on repeated claims | Medium | 370 -| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) | 371 -| **Batch fact extraction** | 20% fewer API calls | Medium | 372 -| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) | 373 -| **Reduce max iterations** | Linear reduction | Low (config change) | 461 +| Optimization | Estimated Savings | Implementation Effort | 462 +| |-| | 463 +| **Cache claim understanding** | 30-50% on repeated claims | Medium | 464 +| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) | 465 +| **Batch fact extraction** | 20% fewer API calls | Medium | 466 +| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) | 467 +| **Reduce max iterations** | Linear reduction | Low (config change) | 374 374 469 +=== 5.2 Timing Optimizations === 375 375 376 -=== 5.2 Timing Optimizations=== 377 - 378 - 379 379 {{mermaid}} 380 380 gantt 381 381 title Current Analysis Timeline (Typical) ... ... 
@@ -401,18 +401,16 @@ 401 401 Generate Verdicts :b5, after b4, 10s 402 402 {{/mermaid}} 403 403 404 -| Optimization | Time Savings | Notes | 405 -|--------------|--------------|-------| 406 -| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel | 407 -| **Streaming LLM responses** | 20-30% perceived | User sees progress faster | 408 -| **Search query batching** | 10-15% | Send multiple queries to search API | 409 -| **Reduce prompt size** | 5-10% per call | Optimize system prompts | 410 -| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet | 496 +| Optimization | Time Savings | Notes | 497 +| | |-| 498 +| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel | 499 +| **Streaming LLM responses** | 20-30% perceived | User sees progress faster | 500 +| **Search query batching** | 10-15% | Send multiple queries to search API | 501 +| **Reduce prompt size** | 5-10% per call | Optimize system prompts | 502 +| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet | 411 411 504 +=== 5.3 Priority Recommendations === 412 412 413 -=== 5.3 Priority Recommendations=== 414 - 415 - 416 416 1. **HIGH PRIORITY - Implement Claim Caching** 417 417 - Cache claim verdicts by content hash 418 418 - Reduces costs for repeated/similar claims ... ... @@ -428,16 +428,12 @@ 428 428 - Cache search results (1h TTL) 429 429 - Reduces external API calls 430 430 431 ---- 521 +---- 432 432 523 +== 6. Separated Verdict Architecture Proposal == 433 433 434 -== 6. Separated Verdict Architecture Proposal== 525 +=== 6.1 Current Architecture === 435 435 436 - 437 - 438 -=== 6.1 Current Architecture=== 439 - 440 - 441 441 {{mermaid}} 442 442 flowchart LR 443 443 subgraph Current["Current: Monolithic Analysis"] ... ... 
@@ -453,10 +453,8 @@ 453 453 - No caching of individual claim verdicts 454 454 - Article verdict tightly coupled to claim extraction 455 455 542 +=== 6.2 Proposed Separated Architecture === 456 456 457 -=== 6.2 Proposed Separated Architecture=== 458 - 459 - 460 460 {{mermaid}} 461 461 flowchart TB 462 462 subgraph Input["Input Processing"] ... ... @@ -509,30 +509,25 @@ 509 509 class CONTEXT,ARTICLE_VERDICT dynamic 510 510 {{/mermaid}} 511 511 596 +=== 6.3 Benefits Analysis === 512 512 513 -=== 6.3 Benefits Analysis=== 598 +| Benefit | Impact | Rationale | 599 +|-| |-| 600 +| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") | 601 +| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims | 602 +| **Consistency** | High | Same claim always gets same verdict (until cache expires) | 603 +| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence | 604 +| **Scalability** | Linear improvement | More users = higher cache hit rate | 514 514 515 - 516 -| Benefit | Impact | Rationale | 517 -|---------|--------|-----------| 518 -| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") | 519 -| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims | 520 -| **Consistency** | High | Same claim always gets same verdict (until cache expires) | 521 -| **Freshness Control** | Configurable TTL | Balance consistency vs. 
new evidence | 522 -| **Scalability** | Linear improvement | More users = higher cache hit rate | 523 - 524 - 525 525 === 6.4 Implementation Considerations === 526 526 527 527 **Claim Hashing Strategy:** 528 -{{code language="typescript"}} 529 -function getClaimHash(claim: string): string { 609 +{{code language="typescript"}}function getClaimHash(claim: string): string { 530 530 // Normalize: lowercase, remove punctuation, stem words 531 531 const normalized = normalize(claim); 532 532 // Hash for cache key 533 533 return crypto.createHash('sha256').update(normalized).digest('hex').slice(0, 16); 534 -} 535 -{{/code}} 614 +}{{/code}} 536 536 537 537 **Cache Invalidation Triggers:** 538 538 - TTL expiration (default 7 days) ... ... @@ -545,7 +545,7 @@ 545 545 - Same claims in different article contexts may yield different article verdicts 546 546 - Example: "Vaccines are safe" + "Vaccines cause autism" → article may be misleading even if first claim is true 547 547 548 -### 6.5 Recommendation 627 +### 6.5 Recommendation## 549 549 550 550 **YES, separating is beneficial** with the following caveats: 551 551 ... ... @@ -561,23 +561,19 @@ 561 561 - Phase 2: Semantic similarity caching (embedding-based) 562 562 - Phase 3: Federated claim sharing across instances 563 563 564 ---- 643 +---- 565 565 645 +== 7. Summary == 566 566 567 -== 7. Summary== 647 +=== Current State === 568 568 569 - 570 - 571 -=== Current State=== 572 - 573 573 - POC1 implements core AKEL pipeline successfully 574 574 - Claim dependency tracking is implemented 575 575 - Multiple LLM providers supported 576 576 - No persistent claim storage or caching 577 577 654 +=== Key Gaps from Specification === 578 578 579 -=== Key Gaps from Specification=== 580 - 581 581 - No scenario extraction 582 582 - No user/role system 583 583 - No audit trail ... ... 
@@ -584,13 +584,10 @@ 584 584 - No source track record updates 585 585 - No review queue 586 586 662 +=== Recommended Next Steps === 587 587 588 -=== Recommended Next Steps=== 589 - 590 590 1. Implement claim caching layer 591 591 2. Separate claim vs article verdict generation 592 592 3. Add Redis for source/search caching 593 593 4. Implement tiered model selection 594 594 5. Add basic audit logging 595 - 596 -