📥 Import Strategy: Vom Ballast zur Sternenkarte
Crumbforest Import Documentation
Version: 1.0
Datum: 2026-01-22
"Alter 'Ballast' wird zur Sternenkarte.
Jeder importierte Post ist ein Geschenk."
🎯 Warum Import?
Das Problem (Klassisch)
Neue Plattform:
├─ Tag 1: 0 Inhalte
├─ Monat 1: 100 Inhalte (mühsam)
├─ Monat 6: 500 Inhalte (langsam)
└─ Jahr 1: 1.000 Inhalte
Vektor:
├─ Tag 1: 0 Embeddings
├─ Monat 6: 500 Embeddings
└─ Jahr 1: 1.000 Embeddings
Token-Kosten:
└─ Hoch für 12+ Monate!
Die Lösung (Import First)
Import:
├─ Tag 1: 5.000 Posts importiert! ✨
├─ Tag 2: 5.000 Embeddings erstellt!
└─ Tag 3: System ist Level 2!
Vektor:
├─ Tag 1: 5.000 Embeddings (Import)
├─ Monat 6: 10.000 Embeddings (+Laufzeit)
└─ Jahr 1: 20.000 Embeddings
Token-Kosten:
└─ Start bei 30€/Monat (statt 100€!)
└─ Nach 6M: 3€/Monat
└─ 56% Ersparnis über 2 Jahre!
📦 Was wird importiert?
Alle Text-Quellen
✅ WordPress (Posts, Pages, Comments)
✅ Forum-Software (phpBB, Discourse, etc.)
✅ Wikis (MediaWiki, DokuWiki, etc.)
✅ Markdown-Repos (GitHub, GitLab)
✅ JSON Backups (Exports)
✅ CSV Dateien (Strukturierte Daten)
✅ PDFs (via OCR, optional)
✅ HTML Archives (Alte Webseiten)
Was NICHT importiert wird
❌ Bilder (nur Metadaten)
❌ Videos (nur Metadaten)
❌ Binärdateien
❌ Private/Sensible Daten (ohne Consent)
❌ Spam/Low-Quality Content
🔧 Die Import-Pipeline
Schritt 1: Connect (Verbinden)
<?php
/**
 * Import Configuration
 *
 * One entry per content source. Supported types (dispatched by
 * AdapterFactory): 'mysql' (live database), 'json' (backup file),
 * 'files' (glob pattern over the filesystem).
 *
 * NOTE(review): credentials are hard-coded for illustration only —
 * in a real deployment load them from the environment and never
 * commit them to the repository.
 */
$sources = [
    'wp_ozm' => [
        'type' => 'mysql',
        'host' => 'localhost',
        'db' => 'wp_ozm',
        'user' => 'root',
        'pass' => 'secret',
        'description' => 'OZM Hauptblog'
    ],
    'wp_legacy' => [
        'type' => 'mysql',
        'host' => 'old-server.local',
        'db' => 'wp_legacy',
        'user' => 'readonly',
        'pass' => 'safe123',
        'description' => 'Alter Legacy Blog'
    ],
    'json_backup' => [
        'type' => 'json',
        'path' => '/backups/export_2024.json',
        'description' => 'JSON Export 2024'
    ],
    'markdown_docs' => [
        'type' => 'files',
        'path' => '/docs/**/*.md',
        'description' => 'Markdown Dokumentation'
    ]
];

/**
 * Create Adapters
 *
 * One adapter per configured source; keys mirror $sources.
 * $adapters is initialized explicitly — the original appended to an
 * undeclared variable (undefined-variable warning in PHP 8).
 */
$adapters = [];
foreach ($sources as $name => $config) {
    $adapters[$name] = AdapterFactory::create($config);
}
Schritt 2: Extract (Extrahieren)
<?php
/**
 * Extract posts from all sources.
 *
 * Expects $adapters (source name => adapter) from the configuration
 * step; each adapter must provide connect(), getAllPosts(), close().
 * A failing source is reported and skipped so one broken source
 * cannot abort the whole import run.
 */
$all_posts = [];

foreach ($adapters as $source_name => $adapter) {
    echo "📥 Extracting from: $source_name\n";
    try {
        // Connect
        if (!$adapter->connect()) {
            echo "❌ Connection failed\n";
            continue;
        }

        // Get posts
        $posts = $adapter->getAllPosts();
        echo "✅ Found " . count($posts) . " posts\n";

        // Tag every post with its origin so later steps can trace it
        foreach ($posts as &$post) {
            $post['_source'] = $source_name;
            $post['_imported_at'] = time();
        }
        // Break the by-reference binding — without this, the next
        // write to $post would silently overwrite the last array
        // element (classic PHP foreach-by-reference bug).
        unset($post);

        $all_posts = array_merge($all_posts, $posts);

        // Close
        $adapter->close();
    } catch (Exception $e) {
        echo "❌ Error: " . $e->getMessage() . "\n";
    }
}

echo "\n📊 Total extracted: " . count($all_posts) . " posts\n";
Beispiel-Extraktion (WordPress)
/**
 * Fetch all published posts and pages from a WordPress database.
 *
 * Returns one row per post; tag and category names are aggregated
 * into comma-separated strings via GROUP_CONCAT (NULL when a post
 * has no terms of that taxonomy).
 *
 * NOTE(review): GROUP BY p.ID combined with non-aggregated columns
 * relies on MySQL's permissive GROUP BY handling — confirm the
 * target server does not run with ONLY_FULL_GROUP_BY enabled.
 *
 * @return array Result rows as produced by query().
 */
public function getAllPosts(): array {
$sql = "
SELECT
p.ID,
p.post_title,
p.post_content,
p.post_excerpt,
p.post_date,
p.post_status,
p.post_type,
p.post_name as slug,
u.display_name as author,
-- Tags (via wp_terms)
GROUP_CONCAT(
DISTINCT CASE
WHEN tt.taxonomy = 'post_tag'
THEN t.name
END
) as tags,
-- Categories (via wp_terms)
GROUP_CONCAT(
DISTINCT CASE
WHEN tt.taxonomy = 'category'
THEN t.name
END
) as categories
FROM wp_posts p
LEFT JOIN wp_users u ON p.post_author = u.ID
LEFT JOIN wp_term_relationships tr ON p.ID = tr.object_id
LEFT JOIN wp_term_taxonomy tt ON tr.term_taxonomy_id = tt.term_taxonomy_id
LEFT JOIN wp_terms t ON tt.term_id = t.term_id
WHERE p.post_status = 'publish'
AND p.post_type IN ('post', 'page')
GROUP BY p.ID
ORDER BY p.post_date DESC
";
return $this->query($sql);
}
Schritt 3: Transform (Normalisieren)
<?php
/**
* Normalize posts to common format
*/
/**
 * Normalize a raw post record into the unified import schema.
 *
 * Detects the source format (WordPress DB rows vs. generic JSON
 * exports) and maps it to: id, title, content, excerpt, date, slug,
 * author, tags[], categories[], source, source_id.
 *
 * @param array  $raw_post Raw record as delivered by an adapter.
 * @param string $source   Source name; becomes part of the unique id.
 * @return array Normalized post.
 * @throws Exception When the record matches no known format.
 */
function normalize_post(array $raw_post, string $source): array {
    // WordPress rows are recognized by their post_title column.
    if (isset($raw_post['post_title'])) {
        return [
            'id' => 'wp_' . $source . '_' . $raw_post['ID'],
            'title' => $raw_post['post_title'],
            'content' => $raw_post['post_content'],
            'excerpt' => $raw_post['post_excerpt'] ?? '',
            'date' => $raw_post['post_date'],
            'slug' => $raw_post['post_name'] ?? '',
            'author' => $raw_post['author'] ?? 'unknown',
            // GROUP_CONCAT yields NULL/'' for posts without terms;
            // explode(',', '') would wrongly produce [''] — the helper
            // maps both cases to an empty list instead.
            'tags' => normalize_term_list($raw_post['tags'] ?? null),
            'categories' => normalize_term_list($raw_post['categories'] ?? null),
            'source' => $source,
            'source_id' => $raw_post['ID']
        ];
    }

    if (isset($raw_post['title'])) {
        // Generic JSON format
        return [
            'id' => 'json_' . $source . '_' . ($raw_post['id'] ?? uniqid()),
            'title' => $raw_post['title'],
            // Some exports use 'content', others 'body'; fall back to ''
            // instead of raising an undefined-index error when both are absent.
            'content' => $raw_post['content'] ?? $raw_post['body'] ?? '',
            'excerpt' => $raw_post['excerpt'] ?? '',
            'date' => $raw_post['created_at'] ?? $raw_post['date'] ?? '',
            'slug' => $raw_post['slug'] ?? '',
            'author' => $raw_post['author'] ?? 'unknown',
            'tags' => $raw_post['tags'] ?? [],
            'categories' => $raw_post['categories'] ?? [],
            'source' => $source,
            'source_id' => $raw_post['id'] ?? null
        ];
    }

    throw new Exception("Unknown post format");
}

/**
 * Split a comma-separated term string into a list.
 * NULL or empty input yields an empty array (never ['']).
 *
 * @param string|null $terms Comma-separated terms or NULL.
 * @return array List of term names.
 */
function normalize_term_list(?string $terms): array {
    if ($terms === null || $terms === '') {
        return [];
    }
    return explode(',', $terms);
}
// Run every extracted post through the normalizer. Records in an
// unknown format are reported and dropped instead of aborting the run.
$normalized_posts = [];
foreach ($all_posts as $raw) {
    try {
        $normalized_posts[] = normalize_post($raw, $raw['_source']);
    } catch (Exception $e) {
        echo "⚠️ Skip: " . $e->getMessage() . "\n";
    }
}
Schritt 4: Log (Events schreiben)
<?php
require_once 'kernel.php';

/**
 * Write one import.post event per normalized post into the kernel log.
 * tags/categories are filtered to drop empty entries before logging.
 */
foreach ($normalized_posts as $post) {
    $payload = [
        'post_id' => $post['id'],
        'title' => $post['title'],
        'content' => $post['content'],
        'excerpt' => $post['excerpt'],
        'date' => $post['date'],
        'slug' => $post['slug'],
        'author' => $post['author'],
        'tags' => array_filter($post['tags']),
        'categories' => array_filter($post['categories']),
        'source' => $post['source'],
        'source_id' => $post['source_id']
    ];
    kernel_log('import.post', $payload, 'importer');
    echo "."; // Progress indicator
}

echo "\n✅ Logged " . count($normalized_posts) . " import events\n";
Schritt 5: Embed (Vektorisieren)
#!/usr/bin/env python3
"""
embed_imports.py
Erstellt Embeddings für alle import.post Events
"""
import json
import requests
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from uuid import uuid4
# Config
# Local Ollama endpoint used for embeddings (runs on-device, no API cost).
OLLAMA_URL = "http://localhost:11434/api/embeddings"
# Qdrant vector database endpoint.
QDRANT_URL = "http://localhost:6333"
# Append-only JSONL event log produced by the PHP import step.
KERNEL_FILE = "data/kernel.jsonl"
# Target Qdrant collection for the imported embeddings.
COLLECTION = "wald_knowledge"
# Init
qdrant = QdrantClient(url=QDRANT_URL)
def embed_text(text: str) -> list:
    """Create an embedding for ``text`` via the local Ollama service (0€!).

    The prompt is truncated to 5,000 characters; a non-200 response
    raises an Exception carrying the raw response body.
    """
    request_body = {
        "model": "nomic-embed-text",
        "prompt": text[:5000],  # Limit to 5k chars
    }
    response = requests.post(OLLAMA_URL, json=request_body, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Ollama error: {response.text}")
    return response.json()['embedding']
def process_imports():
    """Embed every pending import.post event from the kernel log into Qdrant.

    Reads KERNEL_FILE line by line (JSONL), embeds title + content via
    the local Ollama model (embed_text) and upserts one point per post
    into COLLECTION. Malformed lines and embedding failures are counted
    and reported but never abort the run.

    NOTE(review): the 'embedded' marker is checked but never written
    back to the log here, so a re-run re-embeds everything — confirm
    whether a later step sets event['meta']['embedded'].
    """
    embedded_count = 0
    error_count = 0

    with open(KERNEL_FILE, 'r') as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                event = json.loads(line)

                # Only process import.post events. .get() with a default
                # keeps events that lack a 'type' field from being counted
                # as embedding errors (the original raised KeyError here).
                if not event.get('type', '').startswith('import.post'):
                    continue

                # Skip if already embedded
                if 'embedded' in event.get('meta', {}):
                    continue

                payload = event['payload']

                # Combine title + content for embedding
                text = f"{payload['title']}\n\n{payload['content']}"

                # Create embedding
                embedding = embed_text(text)

                # Upsert to Qdrant (random UUID as point id)
                qdrant.upsert(
                    collection_name=COLLECTION,
                    points=[
                        PointStruct(
                            id=str(uuid4()),
                            vector=embedding,
                            payload={
                                'type': 'import',
                                'post_id': payload['post_id'],
                                'title': payload['title'],
                                'excerpt': payload['excerpt'],
                                'date': payload['date'],
                                'author': payload['author'],
                                'tags': payload['tags'],
                                'categories': payload['categories'],
                                'source': payload['source'],
                                'content_preview': text[:500]
                            }
                        )
                    ]
                )

                embedded_count += 1
                if embedded_count % 100 == 0:
                    print(f"✅ Embedded {embedded_count} posts...")
            except Exception as e:
                error_count += 1
                print(f"❌ Line {line_num}: {e}")

    print(f"\n✅ Total embedded: {embedded_count}")
    print(f"❌ Errors: {error_count}")

    # Update collection stats
    info = qdrant.get_collection(COLLECTION)
    print(f"\n📊 Collection '{COLLECTION}':")
    print(f"  Points: {info.points_count}")
    print(f"  Status: {info.status}")


if __name__ == "__main__":
    process_imports()
Schritt 6: Verify (Prüfen)
#!/bin/bash
# verify_import.sh
#
# Sanity-checks an import run: counts kernel events, inspects the
# Qdrant collection, and fires one sample semantic search.

echo "🔍 Import Verification"
echo "====================="
echo ""

# Check kernel events.
# grep -c replaces the redundant `cat | grep | wc -l` pipeline
# (note: grep -c exits non-zero when the count is 0 — harmless here,
# the script does not run under `set -e`).
echo "📊 Kernel Events:"
IMPORT_COUNT=$(grep -c '"type":"import.post"' data/kernel.jsonl)
echo "  Import Events: $IMPORT_COUNT"

# Check Qdrant
echo ""
echo "📊 Qdrant Collection:"
curl -s http://localhost:6333/collections/wald_knowledge | jq '{
  points: .result.points_count,
  status: .result.status,
  vectors: .result.config.params.vectors
}'

# Sample search test
echo ""
echo "🔍 Sample Search Test:"
echo "  Query: 'nullfeld'"

python3 << EOF
import requests
from qdrant_client import QdrantClient

qdrant = QdrantClient("http://localhost:6333")

# Embed query
embed_response = requests.post(
    "http://localhost:11434/api/embeddings",
    json={"model": "nomic-embed-text", "prompt": "nullfeld"}
)
query_vector = embed_response.json()['embedding']

# Search
results = qdrant.search(
    collection_name="wald_knowledge",
    query_vector=query_vector,
    limit=3
)

for i, hit in enumerate(results, 1):
    print(f"  {i}. {hit.payload['title'][:50]}... (Score: {hit.score:.3f})")
EOF

echo ""
echo "✅ Import verification complete!"
📋 Import-Workflows
Workflow 1: Single WordPress Blog
#!/bin/bash
# import_single_wp.sh
#
# End-to-end import of a single WordPress database:
# write config -> extract/transform -> embed -> verify.
echo "📥 WordPress Import"
echo "=================="
echo ""
# 1. Configure
# The quoted heredoc is written verbatim to import_config.php —
# credentials here are placeholders; replace before running.
cat > import_config.php << 'EOF'
<?php
return [
'wp_main' => [
'type' => 'mysql',
'host' => 'localhost',
'db' => 'wordpress',
'user' => 'root',
'pass' => 'password'
]
];
EOF
# 2. Extract & Transform
echo "🔄 Extracting..."
php import.php --config=import_config.php
# 3. Embed
echo "🧠 Creating embeddings..."
python3 embed_imports.py
# 4. Verify
echo "✅ Verifying..."
bash verify_import.sh
echo ""
echo "🎉 Import complete!"
Timeline: 15-30 Minuten für 1.000 Posts
Workflow 2: Multiple Sources
#!/bin/bash
# import_multi.sh
#
# Imports several sources in sequence, then embeds everything in one
# batch and prints per-source statistics.

echo "📥 Multi-Source Import"
echo "====================="
echo ""

# 1. Configure all sources
cat > sources.json << 'EOF'
{
  "wp_ozm": {
    "type": "mysql",
    "host": "localhost",
    "db": "wp_ozm"
  },
  "wp_legacy": {
    "type": "mysql",
    "host": "old-server",
    "db": "wp_old"
  },
  "json_backup": {
    "type": "json",
    "path": "/backups/2024.json"
  },
  "markdown_docs": {
    "type": "files",
    "pattern": "/docs/**/*.md"
  }
}
EOF

# 2. Import each source.
# Read line-wise and quote "$source" — a `for x in $(...)` loop would
# word-split and glob-expand the source names.
jq -r 'keys[]' sources.json | while IFS= read -r source; do
    echo "📥 Importing: $source"
    php import.php --source="$source"
    echo ""
done

# 3. Batch embed (more efficient than embedding per source)
echo "🧠 Batch embedding..."
python3 embed_imports.py --batch-size=100

# 4. Stats (grep reads the file directly — no useless cat)
echo ""
echo "📊 Final Stats:"
grep '"type":"import.post"' data/kernel.jsonl | \
    jq -r '.payload.source' | sort | uniq -c

echo ""
bash verify_import.sh
Timeline: 3-6 Stunden für 10.000 Posts
Workflow 3: Incremental Import
#!/bin/bash
# import_incremental.sh
#
# Imports only posts created after the last recorded import event.

echo "📥 Incremental Import (Updates only)"
echo "===================================="
echo ""

# 1. Get last import timestamp.
# Fall back to 0 when no import.post event exists yet (fresh system) —
# otherwise `date -d @` below fails on an empty/"null" value and the
# first run would import nothing.
LAST_TS=$(grep '"type":"import.post"' data/kernel.jsonl | tail -1 | jq -r '.ts')
if [ -z "$LAST_TS" ] || [ "$LAST_TS" = "null" ]; then
    LAST_TS=0
fi

# NOTE(review): `date -d @...` is GNU date syntax; BSD/macOS needs `date -r`.
echo "🕐 Last import: $(date -d "@$LAST_TS")"

# 2. Import only new posts
php import.php \
    --source=wp_main \
    --since="$LAST_TS" \
    --incremental

# 3. Embed new only
python3 embed_imports.py --new-only

echo ""
echo "✅ Incremental import complete!"
Timeline: 1-5 Minuten (nur neue Posts)
💾 Import-Größen & Performance
Mini-Import (1 Blog, 1.000 Posts)
Extract: 5 Min
Transform: 10 Min
Log: 2 Min
Embed: 30 Min (lokal Ollama)
───────────────────
TOTAL: 47 Min
Ressourcen:
- CPU: 50% (Embedding)
- RAM: 2GB
- Disk: 500MB (Vektor)
- Token: 0€ ✨
Medium-Import (10 Blogs, 10.000 Posts)
Extract: 30 Min
Transform: 1 Std
Log: 10 Min
Embed: 5 Std (lokal Ollama)
───────────────────
TOTAL: 6.5 Std
Ressourcen:
- CPU: 60% (Embedding)
- RAM: 4GB
- Disk: 5GB (Vektor)
- Token: 0€ ✨
Large-Import (100 Sources, 100.000 Posts)
Extract: 5 Std
Transform: 10 Std
Log: 1 Std
Embed: 50 Std (2 Tage mit GPU)
───────────────────
TOTAL: 3 Tage
Ressourcen:
- CPU: 80% (Embedding)
- RAM: 8GB
- Disk: 50GB (Vektor)
- Token: 0€ ✨
- GPU: Optional (10x schneller!)
Mit GPU (NVIDIA RTX 3060):
- Embedding: 5 Std statt 50 Std
- TOTAL: 8 Std statt 3 Tage!
💰 Kosten-Nutzen-Analyse
Investment (Einmalig)
Zeit:
- Setup: 1-2 Std
- Import: 1-48 Std (je nach Größe)
- Testing: 1-2 Std
Kosten:
- Token: 0€ (lokal Ollama!)
- Strom: 5-50€ (je nach Dauer)
- Hardware: 0€ (vorhanden)
───────────────────────────
TOTAL: 5-50€ einmalig
Return (Lebenslang)
Gespart pro Monat:
- Mini (1k): 20€ Token
- Medium (10k): 50€ Token
- Large (100k): 80€ Token
Break-Even:
- Mini: Tag 8
- Medium: Tag 30
- Large: Tag 23
ROI über 2 Jahre:
- Mini: 100x (20€×24M = 480€)
- Medium: 250x (50€×24M = 1.200€)
- Large: 400x (80€×24M = 1.920€)
Import ist das beste Investment! 🚀
🔍 Quality Control
Automatische Qualitätsprüfung
def check_import_quality(collection_name: str):
    """Spot-check embedding quality of an imported Qdrant collection.

    Samples up to 100 points, estimates the near-duplicate rate via
    pairwise cosine similarity and the topical spread via k-means
    clustering, then prints a traffic-light rating.
    """
    # Imported locally so the snippet is self-contained — the original
    # referenced np/KMeans without importing them anywhere.
    import numpy as np
    from sklearn.cluster import KMeans

    qdrant = QdrantClient("http://localhost:6333")

    # Sample random points; scroll() returns (points, next_offset)
    sample = qdrant.scroll(
        collection_name=collection_name,
        limit=100,
        with_vectors=True
    )[0]

    # Guard: an empty collection would otherwise divide by zero below.
    if not sample:
        print("📊 Quality Report: collection is empty — nothing to check")
        return

    def cosine_similarity(a, b) -> float:
        """Cosine similarity of two equal-length vectors."""
        a = np.asarray(a)
        b = np.asarray(b)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    # Count near-duplicate pairs (O(n²) is fine for n <= 100)
    duplicates = 0
    for i, point_a in enumerate(sample):
        for point_b in sample[i + 1:]:
            if cosine_similarity(point_a.vector, point_b.vector) > 0.98:
                duplicates += 1

    # Cluster balance — one dominant cluster suggests monotone content.
    # n_clusters is capped by the sample size (KMeans raises otherwise).
    vectors = [p.vector for p in sample]
    kmeans = KMeans(n_clusters=min(5, len(sample))).fit(vectors)
    cluster_sizes = np.bincount(kmeans.labels_)

    print("📊 Quality Report:")
    print(f"  Sample Size: {len(sample)}")
    print(f"  Duplicates: {duplicates} ({duplicates/len(sample)*100:.1f}%)")
    print(f"  Clusters: {len(cluster_sizes)}")
    print(f"  Largest Cluster: {max(cluster_sizes)} ({max(cluster_sizes)/len(sample)*100:.1f}%)")

    # Rating
    if duplicates < 5 and max(cluster_sizes) < 50:
        print("  Rating: 🟢 EXCELLENT")
    elif duplicates < 10 and max(cluster_sizes) < 70:
        print("  Rating: 🟡 GOOD")
    else:
        print("  Rating: 🔴 NEEDS IMPROVEMENT")
📚 Import-Beispiele
Beispiel 1: WordPress mit Tags & Kategorien
// Source: OZM Blog (5.000 Posts)
$adapter = new MySQLAdapter([
    'host' => 'localhost',
    'db' => 'wp_ozm',
    'user' => 'root',
    'pass' => 'secret'
]);

$posts = $adapter->getAllPosts();
// → 5,000 posts with tags & categories

foreach ($posts as $post) {
    kernel_log('import.post', [
        'post_id' => 'wp_ozm_' . $post['ID'],
        'title' => $post['post_title'],
        'content' => $post['post_content'],
        // GROUP_CONCAT returns NULL for posts without terms —
        // '?? ""' avoids passing null to explode() (deprecated since PHP 8.1)
        'tags' => explode(',', $post['tags'] ?? ''),
        'categories' => explode(',', $post['categories'] ?? '')
    ]);
}

// Embed
exec('python3 embed_imports.py');

// Result:
// ✅ 5,000 embeddings
// ✅ Level 2 reached (day 1!)
// ✅ 70% of questions answerable locally
Beispiel 2: JSON Backup (Legacy Forum)
// backup_2020.json
{
"threads": [
{
"id": 123,
"title": "Wie starte ich mit FPGA?",
"content": "Ich möchte anfangen...",
"author": "user_42",
"created": "2020-03-15",
"replies": 23,
"tags": ["hardware", "fpga", "anfänger"]
},
// ... 2.000 Threads
]
}
// Import
// JSONAdapter reads the backup file and exposes its threads through
// the same getAllPosts() interface as the database adapters.
$adapter = new JSONAdapter('/backups/backup_2020.json');
$threads = $adapter->getAllPosts();
foreach ($threads as $thread) {
// One import.post event per forum thread; the embedding step
// picks these up from the kernel log later.
kernel_log('import.post', [
'post_id' => 'forum_' . $thread['id'],
'title' => $thread['title'],
'content' => $thread['content'],
'tags' => $thread['tags'],
'source' => 'legacy_forum'
]);
}
// Result:
// ✅ 2.000 Embeddings (zusätzlich!)
// ✅ Level 2 → Level 3
// ✅ 80% Fragen lokal
🎯 Best Practices
DO ✅
- Import groß beginnen: 5.000+ Posts wenn möglich
- Qualität vor Quantität: Lieber 1.000 gute als 10.000 schlechte
- Batch Processing: 100er Pakete für Embeddings
- Error Handling: Fehlerhafte Einträge überspringen, nicht den ganzen Import abbrechen
- Verify Always: Nach jedem Import testen
- Incremental Updates: Regelmäßig neue Inhalte importieren
DON'T ❌
- Kein Import von Spam: Qualität checken
- Kein Import ohne Backup: Erst sichern!
- Keine Cloud-Embeddings: Lokales Ollama nutzen!
- Nicht alles auf einmal: Sources nacheinander
- Kein Import von Duplikaten: Deduplizieren!
🚀 Zusammenfassung
Import Strategy = Autonomie-Turbo
✅ Tag 1: 5.000+ Embeddings (statt 0)
✅ Woche 1: Level 2 erreicht (statt Monat 6)
✅ Monat 6: Level 4 erreicht (statt Jahr 2)
✅ Kosten: 56% Ersparnis über 2 Jahre
✅ Token: 0€ für Import (lokal Ollama)
Der alte "Ballast":
- Ist kein Ballast
- Ist ein Geschenk
- Ist die Sternenkarte
- Ist der Boost in die Autonomie
Wuuuhuuuu! 🦉✨
Version: 1.0
Lizenz: CKL
Import: Empfohlen
Status: Ready ✨