Changes for page FactHarbor POC1 Architecture Analysis 1.Jan.26

Last modified by Robert Schaub on 2026/02/08 08:12

From 11.1 to 11.2

From version 1.1

edited by Robert Schaub
on 2026/01/02 09:59

Change comment: There is no comment for this version

To version 11.1

edited by Robert Schaub
on 2026/01/02 10:14

Change comment: There is no comment for this version

Raw
Rendered

Summary

Page properties (2 modified, 0 added, 0 removed)

Details

Page properties

Title

@@ -1,1 +1,1 @@
--FactHarbor POC1 Architecture Analysis
++FactHarbor POC1 Architecture Analysis 1.Jan.26

Content

@@ -1,14 +1,12 @@
++= FactHarbor POC1 Architecture Analysis =
--= FactHarbor POC1 Architecture Analysis=
--
--
  **Version:** 2.6.17
  **Analysis Date:** January 2026
  **Document Purpose:** Technical diagrams, gap analysis, and optimization recommendations
-----
++----
--== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions)==
++== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions) ==
  {{mermaid}}
@@ -92,12 +92,123 @@
      class UNDERSTAND,DECIDE,FETCHSRC,EXTRACT,VERDICT,REPORT step
  {{/mermaid}}
-----
++----
++== 2. ERD Data Model (Current POC1 Implementation) ==
--== 2. ERD Data Model (Current POC1 Implementation)==
++**Data Objects ERD**
++{{mermaid}}
++erDiagram
++    ARTICLE ||--o{ CLAIM : "contains"
++    ARTICLE ||--|| ARTICLE_VERDICT : "has"
++    CLAIM ||--|| CLAIM_VERDICT : "has"
++    CLAIM ||--o{ CLAIM : "depends on"
++    CLAIM_VERDICT }o--o{ EVIDENCE : "supported by"
++    SOURCE ||--o{ EVIDENCE : "provides"
++    ARTICLE ||--o{ SOURCE : "references"
++    ARTICLE {
++        string id PK "Unique identifier (job ID)"
++        string inputType "text | url"
++        string inputValue "Original URL or text"
++        string articleThesis "Main argument/thesis"
++        string detectedInputType "question | claim | article"
++        boolean isQuestion "True if input is a question"
++        datetime createdAt "Analysis timestamp"
++        datetime updatedAt "Last update"
++        json distinctProceedings "Legal proceedings if any"
++        boolean hasMultipleProceedings "Multi-proceeding flag"
++        string proceedingContext "Context for proceedings"
++        json logicalFallacies "Detected fallacies array"
++        boolean isPseudoscience "Pseudoscience detection"
++        string_array pseudoscienceCategories "Categories if detected"
++        int llmCalls "Total LLM API calls"
++        json searchQueries "All search queries performed"
++        string schemaVersion "e.g. 2.6.17"
++    }
++
++    CLAIM {
++        string id PK "SC1, SC2, C1, etc."
++        string articleId FK "Parent article"
++        string text "The claim statement"
++        string type "legal | procedural | factual | evaluative"
++        string claimRole "attribution | source | timing | core"
++        string_array dependsOn "IDs of prerequisite claims"
++        string_array keyEntities "Named entities in claim"
++        boolean isCentral "Is this a central claim?"
++        string relatedProceedingId "Linked proceeding if any"
++        int startOffset "Position in original text"
++        int endOffset "End position in original text"
++        string approximatePosition "Descriptive position"
++    }
++
++    CLAIM_VERDICT {
++        string id PK "Same as claim ID"
++        string claimId FK "Reference to claim"
++        string llmVerdict "WELL-SUPPORTED | PARTIALLY-SUPPORTED | UNCERTAIN | REFUTED"
++        string verdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
++        int confidence "0-100 LLM confidence"
++        int truthPercentage "0-100 calibrated truth score"
++        string riskTier "A (high) | B (medium) | C (low)"
++        string reasoning "Explanation of verdict"
++        string_array supportingFactIds "Evidence IDs supporting this"
++        boolean dependencyFailed "True if prerequisite failed"
++        string_array failedDependencies "Which deps failed"
++        string highlightColor "green | light-green | yellow | orange | dark-orange | red | dark-red"
++        boolean isPseudoscience "Pseudoscience flag"
++        string escalationReason "Why verdict was escalated"
++    }
++
++    ARTICLE_VERDICT {
++        string id PK "Same as article ID"
++        string articleId FK "Reference to article"
++        string llmArticleVerdict "Original LLM verdict"
++        int llmArticleConfidence "Original LLM confidence"
++        string articleVerdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
++        int articleTruthPercentage "0-100 calibrated score"
++        string articleVerdictReason "Why verdict differs from claims avg"
++        int claimsAverageTruthPercentage "Average of claim verdicts"
++        string claimsAverageVerdict "7-point average verdict"
++        int claimsTotal "Total claims analyzed"
++        int claimsSupported "Claims with truth >= 72%"
++        int claimsUncertain "Claims with truth 43-71%"
++        int claimsRefuted "Claims with truth < 43%"
++        int centralClaimsTotal "Number of central claims"
++        int centralClaimsSupported "Central claims supported"
++    }
++
++    EVIDENCE {
++        string id PK "S1-F1, S1-F2 format"
++        string sourceId FK "Reference to source"
++        string claimId FK "Optional: specific claim this supports"
++        string fact "The factual statement extracted"
++        string category "legal_provision | evidence | expert_quote | statistic | event | criticism"
++        string specificity "high | medium"
++        string sourceExcerpt "Original text excerpt"
++        string relatedProceedingId "Linked proceeding if any"
++        boolean isContestedClaim "Is this a contested assertion"
++        string claimSource "Who made contested claim"
++    }
++
++    SOURCE {
++        string id PK "S1, S2, etc."
++        string articleId FK "Parent article"
++        string url "Full URL"
++        string title "Page/document title"
++        string domain "Extracted domain"
++        int trackRecordScore "0-100 reliability score or null"
++        string fullText "Extracted content"
++        datetime fetchedAt "When content was fetched"
++        string category "news | academic | government | legal"
++        boolean fetchSuccess "True if fetch succeeded"
++        string searchQuery "Which query found this"
++        string mimeType "text/html | application/pdf"
++    }
++{{/mermaid}}
++
++**Data Usage ERD**
++
  {{mermaid}}
  erDiagram
      JOB ||--o{ JOB_EVENT : "has"
@@ -187,12 +187,10 @@
      }
  {{/mermaid}}
-----
++----
++== 3. Overall Architecture with Interactions ==
--== 3. Overall Architecture with Interactions==
--
--
  {{mermaid}}
  flowchart TB
      subgraph Client["🖥️ Client Layer"]
@@ -286,77 +286,64 @@
      class ANALYZE_API,JOBS_API,JOB_API,EVENTS_API,RUN_JOB api
  {{/mermaid}}
-----
++----
++== 4. Specification vs Implementation Gap Analysis ==
--== 4. Specification vs Implementation Gap Analysis==
++=== 4.1 Data Model Gaps ===
++| Specification Entity | POC1 Status | Gap Description |
++|-|-|-|
++| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
++| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` |
++| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler |
++| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts |
++| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` |
++| **User** | ❌ Missing | No user authentication or role system |
++| **Edit** | ❌ Missing | No audit trail for changes |
--
--=== 4.1 Data Model Gaps===
--
--
--| Specification Entity | POC1 Status | Gap Description |
--|---------------------|-------------|-----------------|
--| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
--| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` |
--| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler |
--| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts |
--| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` |
--| **User** | ❌ Missing | No user authentication or role system |
--| **Edit** | ❌ Missing | No audit trail for changes |
--
--
  === 4.2 AKEL Component Gaps ===
--| Spec Component | POC1 Status | Gap Description |
--|----------------|-------------|-----------------|
--| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role |
--| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking |
--| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification |
--| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction |
--| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function |
--| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search |
--| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks |
--| **Audit Sampling Scheduler** | ❌ Missing | No audit system |
--| **Embedding Handler** | ❌ Missing | Not needed for POC |
--| **Federation Sync** | ❌ Missing | Not needed for POC |
++| Spec Component | POC1 Status | Gap Description |
++|-|-|-|
++| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role |
++| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking |
++| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification |
++| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction |
++| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function |
++| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search |
++| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks |
++| **Audit Sampling Scheduler** | ❌ Missing | No audit system |
++| **Embedding Handler** | ❌ Missing | Not needed for POC |
++| **Federation Sync** | ❌ Missing | Not needed for POC |
++=== 4.3 Architecture Gaps ===
--=== 4.3 Architecture Gaps===
++| Spec Requirement | POC1 Status | Gap Description |
++|-|-|-|
++| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
++| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
++| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
++| **Redis Caching** | ❌ Missing | No caching layer |
++| **S3 Archival** | ❌ Missing | No long-term storage |
++| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming |
++| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection |
++=== 4.4 Publication & Review Gaps ===
--| Spec Requirement | POC1 Status | Gap Description |
--|------------------|-------------|-----------------|
--| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
--| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
--| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
--| **Redis Caching** | ❌ Missing | No caching layer |
--| **S3 Archival** | ❌ Missing | No long-term storage |
--| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming |
--| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection |
++| Spec Feature | POC1 Status | Gap Description |
++|-|-|-|
++| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
++| **Human Review Queue** | ❌ Missing | No review workflow |
++| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
++| **Audit Rate Sampling** | ❌ Missing | No sampling audits |
++----
--=== 4.4 Publication & Review Gaps===
++== 5. Optimization Recommendations ==
++=== 5.1 Cost Optimizations ===
--| Spec Feature | POC1 Status | Gap Description |
--|--------------|-------------|-----------------|
--| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
--| **Human Review Queue** | ❌ Missing | No review workflow |
--| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
--| **Audit Rate Sampling** | ❌ Missing | No sampling audits |
--
-----
--
--
--== 5. Optimization Recommendations==
--
--
--
--=== 5.1 Cost Optimizations===
--
--
  {{mermaid}}
  pie title Current LLM Cost Distribution (Estimated per Analysis)
      "Step 1: Understand" : 15
@@ -364,18 +364,16 @@
      "Step 3: Verdicts" : 25
  {{/mermaid}}
--| Optimization | Estimated Savings | Implementation Effort |
--|--------------|-------------------|----------------------|
--| **Cache claim understanding** | 30-50% on repeated claims | Medium |
--| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
--| **Batch fact extraction** | 20% fewer API calls | Medium |
--| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) |
--| **Reduce max iterations** | Linear reduction | Low (config change) |
++| Optimization | Estimated Savings | Implementation Effort |
++|-|-|-|
++| **Cache claim understanding** | 30-50% on repeated claims | Medium |
++| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
++| **Batch fact extraction** | 20% fewer API calls | Medium |
++| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) |
++| **Reduce max iterations** | Linear reduction | Low (config change) |
++=== 5.2 Timing Optimizations ===
--=== 5.2 Timing Optimizations===
--
--
  {{mermaid}}
  gantt
      title Current Analysis Timeline (Typical)
@@ -401,18 +401,16 @@
      Generate Verdicts   :b5, after b4, 10s
  {{/mermaid}}
--| Optimization | Time Savings | Notes |
--|--------------|--------------|-------|
--| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
--| **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
--| **Search query batching** | 10-15% | Send multiple queries to search API |
--| **Reduce prompt size** | 5-10% per call | Optimize system prompts |
--| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet |
++| Optimization | Time Savings | Notes |
++|-|-|-|
++| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
++| **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
++| **Search query batching** | 10-15% | Send multiple queries to search API |
++| **Reduce prompt size** | 5-10% per call | Optimize system prompts |
++| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet |
++=== 5.3 Priority Recommendations ===
--=== 5.3 Priority Recommendations===
--
--
  1. **HIGH PRIORITY - Implement Claim Caching**
     - Cache claim verdicts by content hash
     - Reduces costs for repeated/similar claims
@@ -428,16 +428,12 @@
     - Cache search results (1h TTL)
     - Reduces external API calls
-----
++----
++== 6. Separated Verdict Architecture Proposal ==
--== 6. Separated Verdict Architecture Proposal==
++=== 6.1 Current Architecture ===
--
--
--=== 6.1 Current Architecture===
--
--
  {{mermaid}}
  flowchart LR
      subgraph Current["Current: Monolithic Analysis"]
@@ -453,10 +453,8 @@
  - No caching of individual claim verdicts
  - Article verdict tightly coupled to claim extraction
++=== 6.2 Proposed Separated Architecture ===
--=== 6.2 Proposed Separated Architecture===
--
--
  {{mermaid}}
  flowchart TB
      subgraph Input["Input Processing"]
@@ -509,30 +509,25 @@
      class CONTEXT,ARTICLE_VERDICT dynamic
  {{/mermaid}}
++=== 6.3 Benefits Analysis ===
--=== 6.3 Benefits Analysis===
++| Benefit | Impact | Rationale |
++|-|-|-|
++| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
++| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
++| **Consistency** | High | Same claim always gets same verdict (until cache expires) |
++| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence |
++| **Scalability** | Linear improvement | More users = higher cache hit rate |
--
--| Benefit | Impact | Rationale |
--|---------|--------|-----------|
--| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
--| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
--| **Consistency** | High | Same claim always gets same verdict (until cache expires) |
--| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence |
--| **Scalability** | Linear improvement | More users = higher cache hit rate |
--
--
  === 6.4 Implementation Considerations ===
  **Claim Hashing Strategy:**
--{{code language="typescript"}}
--function getClaimHash(claim: string): string {
++{{code language="typescript"}}function getClaimHash(claim: string): string {
    // Normalize: lowercase, remove punctuation, stem words
    const normalized = normalize(claim);
    // Hash for cache key
    return crypto.createHash('sha256').update(normalized).digest('hex').slice(0, 16);
--}
--{{/code}}
++}{{/code}}
  **Cache Invalidation Triggers:**
  - TTL expiration (default 7 days)
@@ -545,7 +545,7 @@
  - Same claims in different article contexts may yield different article verdicts
  - Example: "Vaccines are safe" + "Vaccines cause autism" → article may be misleading even if first claim is true
--### 6.5 Recommendation
++=== 6.5 Recommendation ===
  **YES, separating is beneficial** with the following caveats:
@@ -561,23 +561,19 @@
     - Phase 2: Semantic similarity caching (embedding-based)
     - Phase 3: Federated claim sharing across instances
-----
++----
++== 7. Summary ==
--== 7. Summary==
++=== Current State ===
--
--
--=== Current State===
--
  - POC1 implements core AKEL pipeline successfully
  - Claim dependency tracking is implemented
  - Multiple LLM providers supported
  - No persistent claim storage or caching
++=== Key Gaps from Specification ===
--=== Key Gaps from Specification===
--
  - No scenario extraction
  - No user/role system
  - No audit trail
@@ -584,13 +584,10 @@
  - No source track record updates
  - No review queue
++=== Recommended Next Steps ===
--=== Recommended Next Steps===
--
  1. Implement claim caching layer
  1. Separate claim vs article verdict generation
  1. Add Redis for source/search caching
  1. Implement tiered model selection
  1. Add basic audit logging
--
--

Changes for page FactHarbor POC1 Architecture Analysis 1.Jan.26

Summary

Details

Applications

Navigation

Need help?