Last modified by Robert Schaub on 2026/02/08 08:12

From version 2.1
edited by Robert Schaub
on 2026/01/02 10:01
Change comment: There is no comment for this version
To version 10.1
edited by Robert Schaub
on 2026/01/02 10:13
Change comment: There is no comment for this version

Summary

Details

Page properties
Title
... ... @@ -1,1 +1,1 @@
1 -FactHarbor POC1 Architecture Analysis
1 +FactHarbor POC1 Architecture Analysis 01.Jan.26
Content
... ... @@ -1,12 +1,9 @@
1 1  = FactHarbor POC1 Architecture Analysis =
2 2  
3 -
4 4  **Version:** 2.6.17
5 5  **Analysis Date:** January 2026
6 6  **Document Purpose:** Technical diagrams, gap analysis, and optimization recommendations
7 7  
8 ------
9 -
10 10  ----
11 11  
12 12  == 1. AKEL Flow Diagram (with LLM and WebSearch Interactions) ==
... ... @@ -93,14 +93,125 @@
93 93   class UNDERSTAND,DECIDE,FETCHSRC,EXTRACT,VERDICT,REPORT step
94 94  {{/mermaid}}
95 95  
96 ------
93 +----
97 97  
98 -
99 99  == 2. ERD Data Model (Current POC1 Implementation) ==
100 100  
97 +**Data Objects ERD**
101 101  
102 102  {{mermaid}}
103 103  erDiagram
101 + ARTICLE ||--o{ CLAIM : "contains"
102 + ARTICLE ||--|| ARTICLE_VERDICT : "has"
103 + CLAIM ||--|| CLAIM_VERDICT : "has"
104 + CLAIM ||--o{ CLAIM : "depends on"
105 + CLAIM_VERDICT }o--o{ EVIDENCE : "supported by"
106 + SOURCE ||--o{ EVIDENCE : "provides"
107 + ARTICLE ||--o{ SOURCE : "references"
108 +
109 + ARTICLE {
110 + string id PK "Unique identifier (job ID)"
111 + string inputType "text | url"
112 + string inputValue "Original URL or text"
113 + string articleThesis "Main argument/thesis"
114 + string detectedInputType "question | claim | article"
115 + boolean isQuestion "True if input is a question"
116 + datetime createdAt "Analysis timestamp"
117 + datetime updatedAt "Last update"
118 + json distinctProceedings "Legal proceedings if any"
119 + boolean hasMultipleProceedings "Multi-proceeding flag"
120 + string proceedingContext "Context for proceedings"
121 + json logicalFallacies "Detected fallacies array"
122 + boolean isPseudoscience "Pseudoscience detection"
123 + string_array pseudoscienceCategories "Categories if detected"
124 + int llmCalls "Total LLM API calls"
125 + json searchQueries "All search queries performed"
126 + string schemaVersion "e.g. 2.6.17"
127 + }
128 +
129 + CLAIM {
130 + string id PK "SC1, SC2, C1, etc."
131 + string articleId FK "Parent article"
132 + string text "The claim statement"
133 + string type "legal | procedural | factual | evaluative"
134 + string claimRole "attribution | source | timing | core"
135 + string_array dependsOn "IDs of prerequisite claims"
136 + string_array keyEntities "Named entities in claim"
137 + boolean isCentral "Is this a central claim?"
138 + string relatedProceedingId "Linked proceeding if any"
139 + int startOffset "Position in original text"
140 + int endOffset "End position in original text"
141 + string approximatePosition "Descriptive position"
142 + }
143 +
144 + CLAIM_VERDICT {
145 + string id PK "Same as claim ID"
146 + string claimId FK "Reference to claim"
147 + string llmVerdict "WELL-SUPPORTED | PARTIALLY-SUPPORTED | UNCERTAIN | REFUTED"
148 + string verdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
149 + int confidence "0-100 LLM confidence"
150 + int truthPercentage "0-100 calibrated truth score"
151 + string riskTier "A (high) | B (medium) | C (low)"
152 + string reasoning "Explanation of verdict"
153 + string_array supportingFactIds "Evidence IDs supporting this"
154 + boolean dependencyFailed "True if prerequisite failed"
155 + string_array failedDependencies "Which deps failed"
156 + string highlightColor "green | light-green | yellow | orange | dark-orange | red | dark-red"
157 + boolean isPseudoscience "Pseudoscience flag"
158 + string escalationReason "Why verdict was escalated"
159 + }
160 +
161 + ARTICLE_VERDICT {
162 + string id PK "Same as article ID"
163 + string articleId FK "Reference to article"
164 + string llmArticleVerdict "Original LLM verdict"
165 + int llmArticleConfidence "Original LLM confidence"
166 + string articleVerdict "True | Mostly True | Leaning True | Unverified | Leaning False | Mostly False | False"
167 + int articleTruthPercentage "0-100 calibrated score"
168 + string articleVerdictReason "Why verdict differs from claims avg"
169 + int claimsAverageTruthPercentage "Average of claim verdicts"
170 + string claimsAverageVerdict "7-point average verdict"
171 + int claimsTotal "Total claims analyzed"
172 + int claimsSupported "Claims with truth >= 72%"
173 + int claimsUncertain "Claims with truth 43-71%"
174 + int claimsRefuted "Claims with truth < 43%"
175 + int centralClaimsTotal "Number of central claims"
176 + int centralClaimsSupported "Central claims supported"
177 + }
178 +
179 + EVIDENCE {
180 + string id PK "S1-F1, S1-F2 format"
181 + string sourceId FK "Reference to source"
182 + string claimId FK "Optional: specific claim this supports"
183 + string fact "The factual statement extracted"
184 + string category "legal_provision | evidence | expert_quote | statistic | event | criticism"
185 + string specificity "high | medium"
186 + string sourceExcerpt "Original text excerpt"
187 + string relatedProceedingId "Linked proceeding if any"
188 + boolean isContestedClaim "Is this a contested assertion"
189 + string claimSource "Who made contested claim"
190 + }
191 +
192 + SOURCE {
193 + string id PK "S1, S2, etc."
194 + string articleId FK "Parent article"
195 + string url "Full URL"
196 + string title "Page/document title"
197 + string domain "Extracted domain"
198 + int trackRecordScore "0-100 reliability score or null"
199 + string fullText "Extracted content"
200 + datetime fetchedAt "When content was fetched"
201 + string category "news | academic | government | legal"
202 + boolean fetchSuccess "True if fetch succeeded"
203 + string searchQuery "Which query found this"
204 + string mimeType "text/html | application/pdf"
205 + }
206 +{{/mermaid}}
207 +
208 +**Data Usage ERD**
209 +
210 +{{mermaid}}
211 +erDiagram
104 104   JOB ||--o{ JOB_EVENT : "has"
105 105   JOB ||--|| ANALYSIS_RESULT : "produces"
106 106   ANALYSIS_RESULT ||--o{ CLAIM_VERDICT : "contains"
... ... @@ -188,12 +188,10 @@
188 188   }
189 189  {{/mermaid}}
190 190  
191 ------
299 +----
192 192  
193 -
194 194  == 3. Overall Architecture with Interactions ==
195 195  
196 -
197 197  {{mermaid}}
198 198  flowchart TB
199 199   subgraph Client["🖥️ Client Layer"]
... ... @@ -287,16 +287,12 @@
287 287   class ANALYZE_API,JOBS_API,JOB_API,EVENTS_API,RUN_JOB api
288 288  {{/mermaid}}
289 289  
290 ------
396 +----
291 291  
292 -
293 293  == 4. Specification vs Implementation Gap Analysis ==
294 294  
295 -
296 -
297 297  === 4.1 Data Model Gaps ===
298 298  
299 -
300 300  | Specification Entity | POC1 Status | Gap Description |
301 301  |-|-|-|
302 302  | **Claim** | ⚠️ Partial | No persistent storage; claims exist only in JSON result. Missing: `status`, `confidence_score`, `risk_score`, `completeness_score`, `version`, `views`, `edit_count` |
... ... @@ -324,9 +324,8 @@
324 324  
325 325  === 4.3 Architecture Gaps ===
326 326  
327 -
328 328  | Spec Requirement | POC1 Status | Gap Description |
329 -||-|-|
430 +|-|-|-|
330 330  | **Three-Layer Architecture** | ✅ Implemented | Interface (Next.js) → Processing (AKEL) → Data (SQLite) |
331 331  | **LLM Abstraction Layer** | ✅ Implemented | AI SDK supports multiple providers with failover |
332 332  | **PostgreSQL Primary DB** | ⚠️ Different | Using SQLite for simplicity (acceptable for POC) |
... ... @@ -337,24 +337,19 @@
337 337  
338 338  === 4.4 Publication & Review Gaps ===
339 339  
340 -
341 341  | Spec Feature | POC1 Status | Gap Description |
342 -||-|-|
442 +|-|-|-|
343 343  | **Risk Tier Publication Rules** | ❌ Missing | All results published immediately regardless of tier |
344 344  | **Human Review Queue** | ❌ Missing | No review workflow |
345 345  | **AI-Generated Labeling** | ⚠️ Partial | Results show "AI analysis" but no formal labeling system |
346 346  | **Audit Rate Sampling** | ❌ Missing | No sampling audits |
347 347  
348 ------
448 +----
349 349  
350 -
351 351  == 5. Optimization Recommendations ==
352 352  
353 -
354 -
355 355  === 5.1 Cost Optimizations ===
356 356  
357 -
358 358  {{mermaid}}
359 359  pie title Current LLM Cost Distribution (Estimated per Analysis)
360 360   "Step 1: Understand" : 15
... ... @@ -363,7 +363,7 @@
363 363  {{/mermaid}}
364 364  
365 365  | Optimization | Estimated Savings | Implementation Effort |
366 -||-----||
462 +|-|-|-|
367 367  | **Cache claim understanding** | 30-50% on repeated claims | Medium |
368 368  | **Use Haiku for fact extraction** | 40% on Step 2 costs | Low (config change) |
369 369  | **Batch fact extraction** | 20% fewer API calls | Medium |
... ... @@ -372,7 +372,6 @@
372 372  
373 373  === 5.2 Timing Optimizations ===
374 374  
375 -
376 376  {{mermaid}}
377 377  gantt
378 378   title Current Analysis Timeline (Typical)
... ... @@ -399,7 +399,7 @@
399 399  {{/mermaid}}
400 400  
401 401  | Optimization | Time Savings | Notes |
402 -|||-----|
497 +|-|-|-|
403 403  | **Parallel source fetching** | Already implemented | Currently fetches 3 sources in parallel |
404 404  | **Streaming LLM responses** | 20-30% perceived | User sees progress faster |
405 405  | **Search query batching** | 10-15% | Send multiple queries to search API |
... ... @@ -408,7 +408,6 @@
408 408  
409 409  === 5.3 Priority Recommendations ===
410 410  
411 -
412 412  1. **HIGH PRIORITY - Implement Claim Caching**
413 413   - Cache claim verdicts by content hash
414 414   - Reduces costs for repeated/similar claims
... ... @@ -424,16 +424,12 @@
424 424   - Cache search results (1h TTL)
425 425   - Reduces external API calls
426 426  
427 ------
521 +----
428 428  
429 -
430 430  == 6. Separated Verdict Architecture Proposal ==
431 431  
432 -
433 -
434 434  === 6.1 Current Architecture ===
435 435  
436 -
437 437  {{mermaid}}
438 438  flowchart LR
439 439   subgraph Current["Current: Monolithic Analysis"]
... ... @@ -449,10 +449,8 @@
449 449  - No caching of individual claim verdicts
450 450  - Article verdict tightly coupled to claim extraction
451 451  
452 -
453 453  === 6.2 Proposed Separated Architecture ===
454 454  
455 -
456 456  {{mermaid}}
457 457  flowchart TB
458 458   subgraph Input["Input Processing"]
... ... @@ -505,12 +505,10 @@
505 505   class CONTEXT,ARTICLE_VERDICT dynamic
506 506  {{/mermaid}}
507 507  
508 -
509 509  === 6.3 Benefits Analysis ===
510 510  
511 -
512 512  | Benefit | Impact | Rationale |
513 -|-| |-----|
599 +|-|-|-|
514 514  | **Cost Reduction** | 40-70% for repeated claims | Many articles share common claims (e.g., "COVID vaccines are safe") |
515 515  | **Faster Analysis** | 50%+ for cached claims | Skip research + LLM calls for known claims |
516 516  | **Consistency** | High | Same claim always gets same verdict (until cache expires) |
... ... @@ -554,13 +554,10 @@
554 554   - Phase 2: Semantic similarity caching (embedding-based)
555 555   - Phase 3: Federated claim sharing across instances
556 556  
557 ------
643 +----
558 558  
559 -
560 560  == 7. Summary ==
561 561  
562 -
563 -
564 564  === Current State ===
565 565  
566 566  - POC1 implements core AKEL pipeline successfully
... ... @@ -568,7 +568,6 @@
568 568  - Multiple LLM providers supported
569 569  - No persistent claim storage or caching
570 570  
571 -
572 572  === Key Gaps from Specification ===
573 573  
574 574  - No scenario extraction
... ... @@ -577,7 +577,6 @@
577 577  - No source track record updates
578 578  - No review queue
579 579  
580 -
581 581  === Recommended Next Steps ===
582 582  
583 583  1. Implement claim caching layer