Last modified by Robert Schaub on 2026/02/08 08:12

From version 1.1
edited by Robert Schaub
on 2026/01/02 09:59
Change comment: There is no comment for this version
To version 11.1
edited by Robert Schaub
on 2026/01/02 10:14
Change comment: There is no comment for this version

Summary

Details

Page properties
Title
... ... @@ -1,1 +1,1 @@
1 -FactHarbor POC1 Architecture Analysis
1 +FactHarbor POC1 Architecture Analysis 1.Jan.26
Content
... ... @@ -1,14 +1,12 @@
1 += FactHarbor POC1 Architecture Analysis =
1 1  
2 -= FactHarbor POC1 Architecture Analysis=
3 -
4 -
5 5  **Version:** 2.6.17
6 6  **Analysis Date:** January 2026
7 7  **Document Purpose:** Technical diagrams, gap analysis, and optimization recommendations
8 8  
9 ----
7 +----
10 10  
11 -== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions)==
9 +== 1. AKEL Flow Diagram (with LLM and WebSearch Interactions) ==
12 12  
13 13  
14 14  {{mermaid}}
... ... @@ -92,12 +92,123 @@
92 92   class UNDERSTAND,DECIDE,FETCHSRC,EXTRACT,VERDICT,REPORT step
93 93  {{/mermaid}}
94 94  
95 ----
93 +----
96 96  
95 +== 2. ERD Data Model (Current POC1 Implementation) ==
97 97  
98 -== 2. ERD Data Model (Current POC1 Implementation)==
97 +**Data Objects ERD**
99 99  
99 +{{mermaid}}
100 +erDiagram
101 + ARTICLE ||--o{ CLAIM : "contains"
102 + ARTICLE ||--|| ARTICLE_VERDICT : "has"
103 + CLAIM ||--|| CLAIM_VERDICT : "has"
104 + CLAIM ||--o{ CLAIM : "depends on"
105 + CLAIM_VERDICT }o--o{ EVIDENCE : "supported by"
106 + SOURCE ||--o{ EVIDENCE : "provides"
107 + ARTICLE ||--o{ SOURCE : "references"
100 100  
109 + ARTICLE {
110 + string id PK "Unique identifier (job ID)"
111 + string inputType "text | url"
112 + string inputValue "Original URL or text"
113 + string articleThesis "Main argument/thesis"
114 + string detectedInputType "question | claim | article"
115 + boolean isQuestion "True if input is a question"
116 + datetime createdAt "Analysis timestamp"
117 + datetime updatedAt "Last update"
118 + json distinctProceedings "Legal proceedings if any"
119 + boolean hasMultipleProceedings "Multi-proceeding flag"
120 + string proceedingContext "Context for proceedings"
121 + json logicalFallacies "Detected fallacies array"
122 + boolean isPseudoscience "Pseudoscience detection"
123 + string_array pseudoscienceCategories "Categories if detected"
124 + int llmCalls "Total LLM API calls"
125 + json searchQueries "All search queries performed"
126 + string schemaVersion "e.g. 2.6.17"
127 + }
128 +
129 + CLAIM {
130 + string id PK "SC1, SC2, C1, etc."
131 + string articleId FK "Parent article"
132 + string text "The claim statement"
133 + string type "legal | procedural | factual | evaluative"
134 + string claimRole "attribution | source | timing | core"
135 + string_array dependsOn "IDs of prerequisite claims"
136 + string_array keyEntities "Named entities in claim"
137 + boolean isCentral "Is this a central claim?"
138 + string relatedProceedingId "Linked proceeding if any"
139 + int startOffset "Position in original text"
140 + int endOffset "End position in original text"
141 + string approximatePosition "Descriptive position"
142 + }
143 +
144 + CLAIM_VERDICT {
145 + string id PK "Same as claim ID"
146 + string claimId FK "Reference to claim"
147 + string llmVerdict "WELL-SUPPORTED | PARTIALLY-SUPPORTED | UNCERTAIN | REFUTED"
148 + string verdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
149 + int confidence "0-100 LLM confidence"
150 + int truthPercentage "0-100 calibrated truth score"
151 + string riskTier "A (high) | B (medium) | C (low)"
152 + string reasoning "Explanation of verdict"
153 + string_array supportingFactIds "Evidence IDs supporting this"
154 + boolean dependencyFailed "True if prerequisite failed"
155 + string_array failedDependencies "Which deps failed"
156 + string highlightColor "green | light-green | yellow | orange | dark-orange | red | dark-red"
157 + boolean isPseudoscience "Pseudoscience flag"
158 + string escalationReason "Why verdict was escalated"
159 + }
160 +
161 + ARTICLE_VERDICT {
162 + string id PK "Same as article ID"
163 + string articleId FK "Reference to article"
164 + string llmArticleVerdict "Original LLM verdict"
165 + int llmArticleConfidence "Original LLM confidence"
166 + string articleVerdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
167 + int articleTruthPercentage "0-100 calibrated score"
168 + string articleVerdictReason "Why verdict differs from claims avg"
169 + int claimsAverageTruthPercentage "Average of claim verdicts"
170 + string claimsAverageVerdict "7-point average verdict"
171 + int claimsTotal "Total claims analyzed"
172 + int claimsSupported "Claims with truth >= 72%"
173 + int claimsUncertain "Claims with truth 43-71%"
174 + int claimsRefuted "Claims with truth < 43%"
175 + int centralClaimsTotal "Number of central claims"
176 + int centralClaimsSupported "Central claims supported"
177 + }
178 +
179 + EVIDENCE {
180 + string id PK "S1-F1, S1-F2 format"
181 + string sourceId FK "Reference to source"
182 + string claimId FK "Optional: specific claim this supports"
183 + string fact "The factual statement extracted"
184 + string category "legal_provision | evidence | expert_quote | statistic | event | criticism"
185 + string specificity "high | medium"
186 + string sourceExcerpt "Original text excerpt"
187 + string relatedProceedingId "Linked proceeding if any"
188 + boolean isContestedClaim "Is this a contested assertion"
189 + string claimSource "Who made contested claim"
190 + }
191 +
192 + SOURCE {
193 + string id PK "S1, S2, etc."
194 + string articleId FK "Parent article"
195 + string url "Full URL"
196 + string title "Page/document title"
197 + string domain "Extracted domain"
198 + int trackRecordScore "0-100 reliability score or null"
199 + string fullText "Extracted content"
200 + datetime fetchedAt "When content was fetched"
201 + string category "news | academic | government | legal"
202 + boolean fetchSuccess "True if fetch succeeded"
203 + string searchQuery "Which query found this"
204 + string mimeType "text/html | application/pdf"
205 + }
206 +{{/mermaid}}
207 +
208 +**Data Usage ERD**
209 +
101 101  {{mermaid}}
102 102  erDiagram
103 103   JOB ||--o{ JOB_EVENT : "has"
... ... @@ -187,12 +187,10 @@
187 187   }
188 188  {{/mermaid}}
189 189  
190 ----
299 +----
191 191  
301 +== 3. Overall Architecture with Interactions ==
192 192  
193 -== 3. Overall Architecture with Interactions==
194 -
195 -
196 196  {{mermaid}}
197 197  flowchart TB
198 198   subgraph Client["🖥️ Client Layer"]
... ... @@ -286,77 +286,64 @@
286 286   class ANALYZE_API,JOBS_API,JOB_API,EVENTS_API,RUN_JOB api
287 287  {{/mermaid}}
288 288  
289 ----
396 +----
290 290  
398 +== 4. Specification vs Implementation Gap Analysis ==
291 291  
292 -== 4. Specification vs Implementation Gap Analysis==
400 +=== 4.1 Data Model Gaps ===
293 293  
402 +| Specification Entity | POC1 Status | Gap Description |
403 +|-|-|-|
404 +| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
405 +| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` |
406 +| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler |
407 +| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts |
408 +| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` |
409 +| **User** | ❌ Missing | No user authentication or role system |
410 +| **Edit** | ❌ Missing | No audit trail for changes |
294 294  
295 -
296 -=== 4.1 Data Model Gaps===
297 -
298 -
299 -| Specification Entity | POC1 Status | Gap Description |
300 -|---------------------|-------------|-----------------|
301 -| **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
302 -| **Evidence** | ⚠️ Partial | Implemented as `ExtractedFact` but lacks: `supports` enum, proper `relevance_score` |
303 -| **Source** | ⚠️ Partial | `FetchedSource` exists but missing: `type` enum, `accuracy_history`, `correction_frequency`, weekly update scheduler |
304 -| **Scenario** | ❌ Missing | Not implemented. Claims are evaluated directly without scenario contexts |
305 -| **Verdict** | ⚠️ Partial | `ClaimVerdict` exists but missing: `likelihood_range`, `uncertainty_factors` array, proper `explanation_summary` |
306 -| **User** | ❌ Missing | No user authentication or role system |
307 -| **Edit** | ❌ Missing | No audit trail for changes |
308 -
309 -
310 310  === 4.2 AKEL Component Gaps ===
311 311  
312 -| Spec Component | POC1 Status | Gap Description |
313 -|----------------|-------------|-----------------|
314 -| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role |
315 -| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking |
316 -| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification |
317 -| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction |
318 -| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function |
319 -| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search |
320 -| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks |
321 -| **Audit Sampling Scheduler** | ❌ Missing | No audit system |
322 -| **Embedding Handler** | ❌ Missing | Not needed for POC |
323 -| **Federation Sync** | ❌ Missing | Not needed for POC |
414 +| Spec Component | POC1 Status | Gap Description |
415 +|-|-|-|
416 +| **AKEL Orchestrator** | ✅ Implemented | `runAnalysis()` function serves this role |
417 +| **Claim Extractor** | ✅ Implemented | `understandClaim()` with claim role/dependency tracking |
418 +| **Claim Classifier** | ⚠️ Partial | Risk tier (A/B/C) assigned, but no domain classification |
419 +| **Scenario Generator** | ❌ Missing | Claims evaluated without scenario extraction |
420 +| **Evidence Summarizer** | ✅ Implemented | `extractFacts()` function |
421 +| **Contradiction Detector** | ⚠️ Partial | `isContestedClaim` flag exists but no active contradiction search |
422 +| **Quality Gate Validator** | ❌ Missing | No source quality gates, no mandatory checks |
423 +| **Audit Sampling Scheduler** | ❌ Missing | No audit system |
424 +| **Embedding Handler** | ❌ Missing | Not needed for POC |
425 +| **Federation Sync** | ❌ Missing | Not needed for POC |
324 324  
427 +=== 4.3 Architecture Gaps ===
325 325  
326 -=== 4.3 Architecture Gaps===
429 +| Spec Requirement | POC1 Status | Gap Description |
430 +|-|-|-|
431 +| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
432 +| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
433 +| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
434 +| **Redis Caching** | ❌ Missing | No caching layer |
435 +| **S3 Archival** | ❌ Missing | No long-term storage |
436 +| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming |
437 +| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection |
327 327  
439 +=== 4.4 Publication & Review Gaps ===
328 328  
329 -| Spec Requirement | POC1 Status | Gap Description |
330 -|------------------|-------------|-----------------|
331 -| **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
332 -| **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
333 -| **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
334 -| **Redis Caching** | ❌ Missing | No caching layer |
335 -| **S3 Archival** | ❌ Missing | No long-term storage |
336 -| **Background Jobs** | ❌ Missing | No scheduler for source updates, cache warming |
337 -| **Quality Monitoring** | ⚠️ Partial | LLM call counting exists, but no anomaly detection |
441 +| Spec Feature | POC1 Status | Gap Description |
442 +|-|-|-|
443 +| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
444 +| **Human Review Queue** | ❌ Missing | No review workflow |
445 +| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
446 +| **Audit Rate Sampling** | ❌ Missing | No sampling audits |
338 338  
448 +----
339 339  
340 -=== 4.4 Publication & Review Gaps===
450 +== 5. Optimization Recommendations ==
341 341  
452 +=== 5.1 Cost Optimizations ===
342 342  
343 -| Spec Feature | POC1 Status | Gap Description |
344 -|--------------|-------------|-----------------|
345 -| **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
346 -| **Human Review Queue** | ❌ Missing | No review workflow |
347 -| **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
348 -| **Audit Rate Sampling** | ❌ Missing | No sampling audits |
349 -
350 ----
351 -
352 -
353 -== 5. Optimization Recommendations==
354 -
355 -
356 -
357 -=== 5.1 Cost Optimizations===
358 -
359 -
360 360  {{mermaid}}
361 361  pie title Current LLM Cost Distribution (Estimated per Analysis)
362 362   "Step 1: Understand" : 15
... ... @@ -364,18 +364,16 @@
364 364   "Step 3: Verdicts" : 25
365 365  {{/mermaid}}
366 366  
367 -| Optimization | Estimated Savings | Implementation Effort |
368 -|--------------|-------------------|----------------------|
369 -| **Cache claim understanding** | 30-50% on repeated claims | Medium |
370 -| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
371 -| **Batch fact extraction** | 20% fewer API calls | Medium |
372 -| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) |
373 -| **Reduce max iterations** | Linear reduction | Low (config change) |
461 +| Optimization | Estimated Savings | Implementation Effort |
462 +|-|-|-|
463 +| **Cache claim understanding** | 30-50% on repeated claims | Medium |
464 +| **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
465 +| **Batch fact extraction** | 20% fewer API calls | Medium |
466 +| **Skip search for known claims** | 50%+ for cached claims | High (needs claim DB) |
467 +| **Reduce max iterations** | Linear reduction | Low (config change) |
374 374  
469 +=== 5.2 Timing Optimizations ===
375 375  
376 -=== 5.2 Timing Optimizations===
377 -
378 -
379 379  {{mermaid}}
380 380  gantt
381 381   title Current Analysis Timeline (Typical)
... ... @@ -401,18 +401,16 @@
401 401   Generate Verdicts :b5, after b4, 10s
402 402  {{/mermaid}}
403 403  
404 -| Optimization | Time Savings | Notes |
405 -|--------------|--------------|-------|
406 -| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
407 -| **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
408 -| **Search query batching** | 10-15% | Send multiple queries to search API |
409 -| **Reduce prompt size** | 5-10% per call | Optimize system prompts |
410 -| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet |
496 +| Optimization | Time Savings | Notes |
497 +|-|-|-|
498 +| **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
499 +| **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
500 +| **Search query batching** | 10-15% | Send multiple queries to search API |
501 +| **Reduce prompt size** | 5-10% per call | Optimize system prompts |
502 +| **Use faster models for extraction** | 30-40% on Step 2 | Claude Haiku vs Sonnet |
411 411  
504 +=== 5.3 Priority Recommendations ===
412 412  
413 -=== 5.3 Priority Recommendations===
414 -
415 -
416 416  1. **HIGH PRIORITY - Implement Claim Caching**
417 417   - Cache claim verdicts by content hash
418 418   - Reduces costs for repeated/similar claims
... ... @@ -428,16 +428,12 @@
428 428   - Cache search results (1h TTL)
429 429   - Reduces external API calls
430 430  
431 ----
521 +----
432 432  
523 +== 6. Separated Verdict Architecture Proposal ==
433 433  
434 -== 6. Separated Verdict Architecture Proposal==
525 +=== 6.1 Current Architecture ===
435 435  
436 -
437 -
438 -=== 6.1 Current Architecture===
439 -
440 -
441 441  {{mermaid}}
442 442  flowchart LR
443 443   subgraph Current["Current: Monolithic Analysis"]
... ... @@ -453,10 +453,8 @@
453 453  - No caching of individual claim verdicts
454 454  - Article verdict tightly coupled to claim extraction
455 455  
542 +=== 6.2 Proposed Separated Architecture ===
456 456  
457 -=== 6.2 Proposed Separated Architecture===
458 -
459 -
460 460  {{mermaid}}
461 461  flowchart TB
462 462   subgraph Input["Input Processing"]
... ... @@ -509,30 +509,25 @@
509 509   class CONTEXT,ARTICLE_VERDICT dynamic
510 510  {{/mermaid}}
511 511  
596 +=== 6.3 Benefits Analysis ===
512 512  
513 -=== 6.3 Benefits Analysis===
598 +| Benefit | Impact | Rationale |
599 +|-|-|-|
600 +| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
601 +| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
602 +| **Consistency** | High | Same claim always gets same verdict (until cache expires) |
603 +| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence |
604 +| **Scalability** | Linear improvement | More users = higher cache hit rate |
514 514  
515 -
516 -| Benefit | Impact | Rationale |
517 -|---------|--------|-----------|
518 -| **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
519 -| **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
520 -| **Consistency** | High | Same claim always gets same verdict (until cache expires) |
521 -| **Freshness Control** | Configurable TTL | Balance consistency vs. new evidence |
522 -| **Scalability** | Linear improvement | More users = higher cache hit rate |
523 -
524 -
525 525  === 6.4 Implementation Considerations ===
526 526  
527 527  **Claim Hashing Strategy:**
528 -{{code language="typescript"}}
529 -function getClaimHash(claim: string): string {
609 +{{code language="typescript"}}function getClaimHash(claim: string): string {
530 530   // Normalize: lowercase, remove punctuation, stem words
531 531   const normalized = normalize(claim);
532 532   // Hash for cache key
533 533   return crypto.createHash('sha256').update(normalized).digest('hex').slice(0, 16);
534 -}
535 -{{/code}}
614 +}{{/code}}
536 536  
537 537  **Cache Invalidation Triggers:**
538 538  - TTL expiration (default 7 days)
... ... @@ -545,7 +545,7 @@
545 545  - Same claims in different article contexts may yield different article verdicts
546 546  - Example: "Vaccines are safe" + "Vaccines cause autism" → article may be misleading even if first claim is true
547 547  
548 -### 6.5 Recommendation
627 +=== 6.5 Recommendation ===
549 549  
550 550  **YES, separating is beneficial** with the following caveats:
551 551  
... ... @@ -561,23 +561,19 @@
561 561   - Phase 2: Semantic similarity caching (embedding-based)
562 562   - Phase 3: Federated claim sharing across instances
563 563  
564 ----
643 +----
565 565  
645 +== 7. Summary ==
566 566  
567 -== 7. Summary==
647 +=== Current State ===
568 568  
569 -
570 -
571 -=== Current State===
572 -
573 573  - POC1 implements core AKEL pipeline successfully
574 574  - Claim dependency tracking is implemented
575 575  - Multiple LLM providers supported
576 576  - No persistent claim storage or caching
577 577  
654 +=== Key Gaps from Specification ===
578 578  
579 -=== Key Gaps from Specification===
580 -
581 581  - No scenario extraction
582 582  - No user/role system
583 583  - No audit trail
... ... @@ -584,13 +584,10 @@
584 584  - No source track record updates
585 585  - No review queue
586 586  
662 +=== Recommended Next Steps ===
587 587  
588 -=== Recommended Next Steps===
589 -
590 590  1. Implement claim caching layer
591 591  2. Separate claim vs article verdict generation
592 592  3. Add Redis for source/search caching
593 593  4. Implement tiered model selection
594 594  5. Add basic audit logging
595 -
596 -