Last modified by Robert Schaub on 2026/02/08 08:12

From version 7.1
edited by Robert Schaub
on 2026/01/02 10:12
Change comment: There is no comment for this version
To version 1.1
edited by Robert Schaub
on 2026/01/02 09:59
Change comment: There is no comment for this version

Summary

Details

Page properties
Content
... ... @@ -1,12 +2,14 @@
1 -= FactHarbor POC1 Architecture Analysis =
2 2  
2 += FactHarbor POC1 Architecture Analysis=
3 +
4 +
3 3  **Version:** 2.6.17
4 4  **Analysis Date:** January 2026
5 5  **Document Purpose:** Technical diagrams, gap analysis, and optimization recommendations
6 6  
7 -----
9 +---
8 8  
9 -== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions) ==
11 +== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions)==
10 10  
11 11  
12 12  {{mermaid}}
... ... @@ -90,123 +90,12 @@
90 90   class UNDERSTAND,DECIDE,FETCHSRC,EXTRACT,VERDICT,REPORT step
91 91  {{/mermaid}}
92 92  
93 -----
95 +---
94 94  
95 -== 2. ERD Data Model (Current POC1 Implementation) ==
96 96  
97 -**Data Objects ERD**
98 +== 2. ERD Data Model (Current POC1 Implementation)==
98 98  
99 -{{mermaid}}
100 -erDiagram
101 - ARTICLE ||--o{ CLAIM : "contains"
102 - ARTICLE ||--|| ARTICLE_VERDICT : "has"
103 - CLAIM ||--|| CLAIM_VERDICT : "has"
104 - CLAIM ||--o{ CLAIM : "depends on"
105 - CLAIM_VERDICT }o--o{ EVIDENCE : "supported by"
106 - SOURCE ||--o{ EVIDENCE : "provides"
107 - ARTICLE ||--o{ SOURCE : "references"
108 108  
109 - ARTICLE {
110 - string id PK "Unique identifier (job ID)"
111 - string inputType "text | url"
112 - string inputValue "Original URL or text"
113 - string articleThesis "Main argument/thesis"
114 - string detectedInputType "question | claim | article"
115 - boolean isQuestion "True if input is a question"
116 - datetime createdAt "Analysis timestamp"
117 - datetime updatedAt "Last update"
118 - json distinctProceedings "Legal proceedings if any"
119 - boolean hasMultipleProceedings "Multi-proceeding flag"
120 - string proceedingContext "Context for proceedings"
121 - json logicalFallacies "Detected fallacies array"
122 - boolean isPseudoscience "Pseudoscience detection"
123 - string_array pseudoscienceCategories "Categories if detected"
124 - int llmCalls "Total LLM API calls"
125 - json searchQueries "All search queries performed"
126 - string schemaVersion "e.g. 2.6.17"
127 - }
128 -
129 - CLAIM {
130 - string id PK "SC1, SC2, C1, etc."
131 - string articleId FK "Parent article"
132 - string text "The claim statement"
133 - string type "legal | procedural | factual | evaluative"
134 - string claimRole "attribution | source | timing | core"
135 - string_array dependsOn "IDs of prerequisite claims"
136 - string_array keyEntities "Named entities in claim"
137 - boolean isCentral "Is this a central claim?"
138 - string relatedProceedingId "Linked proceeding if any"
139 - int startOffset "Position in original text"
140 - int endOffset "End position in original text"
141 - string approximatePosition "Descriptive position"
142 - }
143 -
144 - CLAIM_VERDICT {
145 - string id PK "Same as claim ID"
146 - string claimId FK "Reference to claim"
147 - string llmVerdict "WELL-SUPPORTED | PARTIALLY-SUPPORTED | UNCERTAIN | REFUTED"
148 - string verdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
149 - int confidence "0-100 LLM confidence"
150 - int truthPercentage "0-100 calibrated truth score"
151 - string riskTier "A (high) | B (medium) | C (low)"
152 - string reasoning "Explanation of verdict"
153 - string_array supportingFactIds "Evidence IDs supporting this"
154 - boolean dependencyFailed "True if prerequisite failed"
155 - string_array failedDependencies "Which deps failed"
156 - string highlightColor "green | light-green | yellow | orange | dark-orange | red | dark-red"
157 - boolean isPseudoscience "Pseudoscience flag"
158 - string escalationReason "Why verdict was escalated"
159 - }
160 -
161 - ARTICLE_VERDICT {
162 - string id PK "Same as article ID"
163 - string articleId FK "Reference to article"
164 - string llmArticleVerdict "Original LLM verdict"
165 - int llmArticleConfidence "Original LLM confidence"
166 - string articleVerdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
167 - int articleTruthPercentage "0-100 calibrated score"
168 - string articleVerdictReason "Why verdict differs from claims avg"
169 - int claimsAverageTruthPercentage "Average of claim verdicts"
170 - string claimsAverageVerdict "7-point average verdict"
171 - int claimsTotal "Total claims analyzed"
172 - int claimsSupported "Claims with truth >= 72%"
173 - int claimsUncertain "Claims with truth 43-71%"
174 - int claimsRefuted "Claims with truth < 43%"
175 - int centralClaimsTotal "Number of central claims"
176 - int centralClaimsSupported "Central claims supported"
177 - }
178 -
179 - EVIDENCE {
180 - string id PK "S1-F1, S1-F2 format"
181 - string sourceId FK "Reference to source"
182 - string claimId FK "Optional: specific claim this supports"
183 - string fact "The factual statement extracted"
184 - string category "legal_provision | evidence | expert_quote | statistic | event | criticism"
185 - string specificity "high | medium"
186 - string sourceExcerpt "Original text excerpt"
187 - string relatedProceedingId "Linked proceeding if any"
188 - boolean isContestedClaim "Is this a contested assertion"
189 - string claimSource "Who made contested claim"
190 - }
191 -
192 - SOURCE {
193 - string id PK "S1, S2, etc."
194 - string articleId FK "Parent article"
195 - string url "Full URL"
196 - string title "Page/document title"
197 - string domain "Extracted domain"
198 - int trackRecordScore "0-100 reliability score or null"
199 - string fullText "Extracted content"
200 - datetime fetchedAt "When content was fetched"
201 - string category "news | academic | government | legal"
202 - boolean fetchSuccess "True if fetch succeeded"
203 - string searchQuery "Which query found this"
204 - string mimeType "text/html | application/pdf"
205 - }
206 -{{/mermaid}}
207 -
208 -**Data Usage ERD**
209 -
210 210  {{mermaid}}
211 211  erDiagram
212 212   JOB ||--o{ JOB_EVENT : "has"
... ... @@ -296,10 +296,12 @@
296 296   }
297 297  {{/mermaid}}
298 298  
299 -----
190 +---
300 300  
301 -== 3. Overall Architecture with Interactions ==
302 302  
193 +== 3. Overall Architecture with Interactions==
194 +
195 +
303 303  {{mermaid}}
304 304  flowchart TB
305 305   subgraph Client["🖥️ Client Layer"]
... ... @@ -393,64 +393,77 @@
393 393   class ANALYZE_API,JOBS_API,JOB_API,EVENTS_API,RUN_JOB api
394 394  {{/mermaid}}
395 395  
396 -----
289 +---
397 397  
398 -== 4. Specification vs Implementation Gap Analysis ==
399 399  
400 -=== 4.1 Data Model Gaps ===
292 +== 4. Specification vs Implementation Gap Analysis==
401 401  
402 -| Specification Entity | POC1 Status | Gap Description |
403 -|-|-|-|
404 -| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
405 -| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` |
406 -| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler |
407 -| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts |
408 -| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` |
409 -| **User** | ❌ Missing | No user authentication or role system |
410 -| **Edit** | ❌ Missing | No audit trail for changes |
411 411  
295 +
296 +=== 4.1 Data Model Gaps===
297 +
298 +
299 +| Specification Entity | POC1 Status | Gap Description |
300 +|---------------------|-------------|-----------------|
301 +| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
302 +| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` |
303 +| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler |
304 +| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts |
305 +| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` |
306 +| **User** | ❌ Missing | No user authentication or role system |
307 +| **Edit** | ❌ Missing | No audit trail for changes |
308 +
309 +
412 412  === 4.2 AKEL Component Gaps ===
413 413  
414 -| Spec Component | POC1 Status | Gap Description |
415 -|-|-|-|
416 -| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role |
417 -| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking |
418 -| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification |
419 -| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction |
420 -| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function |
421 -| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search |
422 -| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks |
423 -| **Audit Sampling Scheduler** | ❌ Missing | No audit system |
424 -| **Embedding Handler** | ❌ Missing | Not needed for POC |
425 -| **Federation Sync** | ❌ Missing | Not needed for POC |
312 +| Spec Component | POC1 Status | Gap Description |
313 +|----------------|-------------|-----------------|
314 +| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role |
315 +| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking |
316 +| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification |
317 +| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction |
318 +| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function |
319 +| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search |
320 +| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks |
321 +| **Audit Sampling Scheduler** | ❌ Missing | No audit system |
322 +| **Embedding Handler** | ❌ Missing | Not needed for POC |
323 +| **Federation Sync** | ❌ Missing | Not needed for POC |
426 426  
427 -=== 4.3 Architecture Gaps ===
428 428  
429 -| Spec Requirement | POC1 Status | Gap Description |
430 -|-|-|-|
431 -| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
432 -| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
433 -| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
434 -| **Redis Caching** | ❌ Missing | No caching layer |
435 -| **S3 Archival** | ❌ Missing | No long-term storage |
436 -| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming |
437 -| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection |
326 +=== 4.3 Architecture Gaps===
438 438  
439 -=== 4.4 Publication & Review Gaps ===
440 440  
441 -| Spec Feature | POC1 Status | Gap Description |
442 -|-|-|-|
443 -| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
444 -| **Human Review Queue** | ❌ Missing | No review workflow |
445 -| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
446 -| **Audit Rate Sampling** | ❌ Missing | No sampling audits |
329 +| Spec Requirement | POC1 Status | Gap Description |
330 +|------------------|-------------|-----------------|
331 +| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
332 +| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
333 +| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
334 +| **Redis Caching** | ❌ Missing | No caching layer |
335 +| **S3 Archival** | ❌ Missing | No long-term storage |
336 +| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming |
337 +| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection |
447 447  
448 -----
449 449  
450 -== 5. Optimization Recommendations ==
340 +=== 4.4 Publication & Review Gaps===
451 451  
452 -=== 5.1 Cost Optimizations ===
453 453  
343 +| Spec Feature | POC1 Status | Gap Description |
344 +|--------------|-------------|-----------------|
345 +| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
346 +| **Human Review Queue** | ❌ Missing | No review workflow |
347 +| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
348 +| **Audit Rate Sampling** | ❌ Missing | No sampling audits |
349 +
350 +---
351 +
352 +
353 +== 5. Optimization Recommendations==
354 +
355 +
356 +
357 +=== 5.1 Cost Optimizations===
358 +
359 +
454 454  {{mermaid}}
455 455  pie title Current LLM Cost Distribution (Estimated per Analysis)
456 456   "Step 1: Understand" : 15
... ... @@ -458,16 +458,18 @@
458 458   "Step 3: Verdicts" : 25
459 459  {{/mermaid}}
460 460  
461 -| Optimization | Estimated Savings | Implementation Effort |
462 -|-|-|-|
463 -| **Cache claim understanding** | 30-50% on repeated claims | Medium |
464 -| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
465 -| **Batch fact extraction** | 20% fewer API calls | Medium |
466 -| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) |
467 -| **Reduce max iterations** | Linear reduction | Low (config change) |
367 +| Optimization | Estimated Savings | Implementation Effort |
368 +|--------------|-------------------|----------------------|
369 +| **Cache claim understanding** | 30-50% on repeated claims | Medium |
370 +| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
371 +| **Batch fact extraction** | 20% fewer API calls | Medium |
372 +| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) |
373 +| **Reduce max iterations** | Linear reduction | Low (config change) |
468 468  
469 -=== 5.2 Timing Optimizations ===
470 470  
376 +=== 5.2 Timing Optimizations===
377 +
378 +
471 471  {{mermaid}}
472 472  gantt
473 473   title Current Analysis Timeline (Typical)
... ... @@ -493,16 +493,18 @@
493 493   Generate Verdicts :b5, after b4, 10s
494 494  {{/mermaid}}
495 495  
496 -| Optimization | Time Savings | Notes |
497 -|-|-|-|
498 -| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
499 -| **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
500 -| **Search query batching** | 10-15% | Send multiple queries to search API |
501 -| **Reduce prompt size** | 5-10% per call | Optimize system prompts |
502 -| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet |
404 +| Optimization | Time Savings | Notes |
405 +|--------------|--------------|-------|
406 +| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
407 +| **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
408 +| **Search query batching** | 10-15% | Send multiple queries to search API |
409 +| **Reduce prompt size** | 5-10% per call | Optimize system prompts |
410 +| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet |
503 503  
504 -=== 5.3 Priority Recommendations ===
505 505  
413 +=== 5.3 Priority Recommendations===
414 +
415 +
506 506  1. **HIGH PRIORITY - Implement Claim Caching**
507 507   - Cache claim verdicts by content hash
508 508   - Reduces costs for repeated/similar claims
... ... @@ -518,12 +518,16 @@
518 518   - Cache search results (1h TTL)
519 519   - Reduces external API calls
520 520  
521 -----
431 +---
522 522  
523 -== 6. Separated Verdict Architecture Proposal ==
524 524  
525 -=== 6.1 Current Architecture ===
434 +== 6. Separated Verdict Architecture Proposal==
526 526  
436 +
437 +
438 +=== 6.1 Current Architecture===
439 +
440 +
527 527  {{mermaid}}
528 528  flowchart LR
529 529   subgraph Current["Current: Monolithic Analysis"]
... ... @@ -539,8 +539,10 @@
539 539  - No caching of individual claim verdicts
540 540  - Article verdict tightly coupled to claim extraction
541 541  
542 -=== 6.2 Proposed Separated Architecture ===
543 543  
457 +=== 6.2 Proposed Separated Architecture===
458 +
459 +
544 544  {{mermaid}}
545 545  flowchart TB
546 546   subgraph Input["Input Processing"]
... ... @@ -593,25 +593,30 @@
593 593   class CONTEXT,ARTICLE_VERDICT dynamic
594 594  {{/mermaid}}
595 595  
596 -=== 6.3 Benefits Analysis ===
597 597  
598 -| Benefit | Impact | Rationale |
599 -|-|-|-|
600 -| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
601 -| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
602 -| **Consistency** | High | Same claim always gets same verdict (until cache expires) |
603 -| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence |
604 -| **Scalability** | Linear improvement | More users = higher cache hit rate |
513 +=== 6.3 Benefits Analysis===
605 605  
515 +
516 +| Benefit | Impact | Rationale |
517 +|---------|--------|-----------|
518 +| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
519 +| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
520 +| **Consistency** | High | Same claim always gets same verdict (until cache expires) |
521 +| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence |
522 +| **Scalability** | Linear improvement | More users = higher cache hit rate |
523 +
524 +
606 606  === 6.4 Implementation Considerations ===
607 607  
608 608  **Claim Hashing Strategy:**
609 -{{code language="typescript"}}function getClaimHash(claim: string): string {
528 +{{code language="typescript"}}
529 +function getClaimHash(claim: string): string {
610 610   // Normalize: lowercase, remove punctuation, stem words
611 611   const normalized = normalize(claim);
612 612   // Hash for cache key
613 613   return crypto.createHash('sha256').update(normalized).digest('hex').slice(0, 16);
614 -}{{/code}}
534 +}
535 +{{/code}}
615 615  
616 616  **Cache Invalidation Triggers:**
617 617  - TTL expiration (default 7 days)
... ... @@ -624,7 +624,7 @@
624 624  - Same claims in different article contexts may yield different article verdicts
625 625  - Example: "Vaccines are safe" + "Vaccines cause autism" → article may be misleading even if first claim is true
626 626  
627 -=== 6.5 Recommendation ===
548 +=== 6.5 Recommendation===
628 628  
629 629  **YES, separating is beneficial** with the following caveats:
630 630  
... ... @@ -640,19 +640,23 @@
640 640   - Phase 2: Semantic similarity caching (embedding-based)
641 641   - Phase 3: Federated claim sharing across instances
642 642  
643 -----
564 +---
644 644  
645 -== 7. Summary ==
646 646  
647 -=== Current State ===
567 +== 7. Summary==
648 648  
569 +
570 +
571 +=== Current State===
572 +
649 649  - POC1 implements core AKEL pipeline successfully
650 650  - Claim dependency tracking is implemented
651 651  - Multiple LLM providers supported
652 652  - No persistent claim storage or caching
653 653  
654 -=== Key Gaps from Specification ===
655 655  
579 +=== Key Gaps from Specification===
580 +
656 656  - No scenario extraction
657 657  - No user/role system
658 658  - No audit trail
... ... @@ -659,10 +659,13 @@
659 659  - No source track record updates
660 660  - No review queue
661 661  
662 -=== Recommended Next Steps ===
663 663  
588 +=== Recommended Next Steps===
589 +
664 664  1. Implement claim caching layer
665 665  2. Separate claim vs article verdict generation
666 666  3. Add Redis for source/search caching
667 667  4. Implement tiered model selection
668 668  5. Add basic audit logging
595 +
596 +