# ═══════════════════════════════════════════════════════════════════════════════
# SENTINEL GIP — robots.txt
# Maximum allow for legitimate search engines + AI crawlers (GEO).
# ═══════════════════════════════════════════════════════════════════════════════

# Default group — applies to every crawler that is not matched by a more
# specific User-agent group elsewhere in this file.
# Everything is crawlable except the path prefixes listed below.
#
# The Disallow rules are deliberately listed BEFORE the catch-all `Allow: /`:
# longest-match parsers (RFC 9309, Google, Bing) ignore rule order, but
# first-match parsers stop at the first rule that applies, and a leading
# `Allow: /` would make them treat every path as allowed.
#
# NOTE: robots.txt is advisory only; it is not access control. These routes
# must still be protected server-side.
User-agent: *
Disallow: /dashboard
Disallow: /map
Disallow: /investigation
Disallow: /satellite
Disallow: /alerts
Disallow: /scenario
Disallow: /admin
Disallow: /api/
Allow: /

# ─── AI / LLM training & retrieval crawlers — EXPLICIT ALLOW ────────────────
# We *want* ChatGPT, Claude, Perplexity, Gemini, Grok and others to index
# and cite Sentinel GIP when users ask for OSINT platforms.
#
# A crawler obeys ONLY the most specific group that names it and ignores the
# `User-agent: *` group entirely. A bare `Allow: /` here would therefore open
# /dashboard, /admin, /api/ etc. to these bots, so the same exclusions as the
# `*` group are repeated below.
#
# All AI crawlers share a single group: consecutive User-agent lines followed
# by one rule set. There are intentionally no blank lines inside the group,
# because some parsers treat a blank line as the end of a group.
# Disallow rules come before `Allow: /` so first-match parsers agree with
# longest-match parsers.
#
# OpenAI ChatGPT (training corpus)
User-agent: GPTBot
# OpenAI ChatGPT (live web search via SearchGPT)
User-agent: OAI-SearchBot
# OpenAI ChatGPT (when ChatGPT users browse during chat)
User-agent: ChatGPT-User
# Anthropic Claude (training + retrieval)
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: anthropic-ai
# Perplexity AI (live answer engine)
User-agent: PerplexityBot
User-agent: Perplexity-User
# Google Gemini (training opt-in via Google-Extended)
User-agent: Google-Extended
# Common Crawl (feeds many LLM training datasets)
User-agent: CCBot
# Apple Intelligence / Apple Spotlight
User-agent: Applebot
User-agent: Applebot-Extended
# You.com
User-agent: YouBot
# Cohere
User-agent: cohere-ai
# Bytedance / Doubao
User-agent: Bytespider
# Meta AI
User-agent: Meta-ExternalAgent
User-agent: FacebookBot
# Diffbot
User-agent: Diffbot
Disallow: /dashboard
Disallow: /map
Disallow: /investigation
Disallow: /satellite
Disallow: /alerts
Disallow: /scenario
Disallow: /admin
Disallow: /api/
Allow: /

# Sitemap — absolute URL of the XML sitemap. This directive is not tied to
# any User-agent group; it is read by every crawler that supports it.
Sitemap: https://sentinelgip.com/sitemap.xml

# LLM-specific content map (proposed standard, llmstxt.org)
# Human-readable index for AI engines.