feat(init): add preview_changes method to show read-only schema init diff

Adds preview_changes method to utils/neo4j-schema-init.py. Performs read-only queries against the live database to compare current state against the v2.3.0 schema spec. Reports expected constraints, indexes, and sample nodes/relationships by team.
2026-05-20 07:34:05 -04:00
parent 812473ad97
commit 3e68764368
3 changed files with 312 additions and 48 deletions
--- a/utils/neo4j-schema-init.py
+++ b/utils/neo4j-schema-init.py
@@ -806,34 +806,167 @@ Full schema: docs/neo4j-unified-schema.md (v2.3.0)
        print(schema_doc)
        logger.info("Schema documentation displayed")
    def preview_changes(self):
        """
        Print a read-only preview of what a full init run would create.

        Opens a session on ``self.driver`` and issues only read queries
        (SHOW CONSTRAINTS, SHOW INDEXES and two count-only MATCH queries),
        then prints to stdout:

          * constraints present vs. the 79 expected, and the shortfall;
          * the number of non-LOOKUP indexes currently present;
          * the total node and relationship counts;
          * the sample nodes (grouped by team) and the sample relationships
            that a full run would MERGE.

        The constraint comparison is by count only, not by name, so
        unrelated constraints in the database make the shortfall look
        smaller than it is. The sample lists are static: this method does
        not check which sample nodes or relationships already exist.

        Returns:
            None. The report is written to stdout.
        """
        # Expected constraint total for the v2.3.0 schema.
        # NOTE(review): maintained by hand; keep in sync with the create_*
        # methods of this script.
        expected_constraints = 79

        # Sample data a full run would MERGE, grouped by owning team.
        sample_nodes_by_team = (
            ("Universal", ("Person:user_main", "Location:location_home")),
            ("Personal — Nate", ("Trip:trip_sample_2025",)),
            ("Personal — Hypatia", ("Book:book_meditations_aurelius",
                                    "Topic:topic_stoicism")),
            ("Personal — Watson", ("Goal:goal_sample_2025",
                                   "EmotionalMemory:memory_sample")),
            ("Personal — Garth", ("Account:account_tfsa_sample",)),
            ("Personal — Shawn", ("Contact:contact_sample_personal",
                                  "Communication:comm_sample")),
            ("Personal — Cristiano", ("Team:team_arsenal",)),
            ("Work", ("Client:client_sample_corp",
                      "Skill:skill_cx_strategy",
                      "Topic:topic_ai_in_cx")),
            ("Engineering — Scotty", ("Infrastructure:infra_neo4j_prod",)),
        )
        sample_rels = (
            "(Person:user_main)-[:SUPPORTS]->(Team:team_arsenal)",
            "(Person:user_main)-[:COMPLETED]->(Book:book_meditations_aurelius)",
            "(Person:user_main)-[:PURSUING]->(Goal:goal_sample_2025)",
            "(Book:book_meditations_aurelius)-[:EXPLORES]->(Topic:topic_stoicism)",
            "(Person:user_main)-[:OWNS]->(Account:account_tfsa_sample)",
            "(Person:user_main)-[:HAD]->(Communication:comm_sample)",
            "(Communication:comm_sample)-[:WITH]->(Contact:contact_sample_personal)",
        )

        # Derive the totals from the lists that are printed below so the
        # summary line can never disagree with the breakdown.
        # NOTE(review): the summary used to hard-code 14 sample nodes while
        # 15 are listed here; confirm the list against create_sample_nodes().
        sample_node_count = sum(len(nodes) for _, nodes in sample_nodes_by_team)
        sample_rel_count = len(sample_rels)

        rule = "═" * 60
        print()
        print(rule)
        print("  DRY RUN — Preview of what a full init would create")
        print(rule)

        with self.driver.session() as session:
            # ── Constraints ──────────────────────────────────────────
            present_constraints = sum(
                1 for _ in session.run("SHOW CONSTRAINTS YIELD name RETURN name")
            )
            # Clamp at zero: the database may hold more constraints than
            # this script knows about.
            missing_constraints = max(0, expected_constraints - present_constraints)
            print(f"\n  Constraints:  {present_constraints} present / "
                  f"{expected_constraints} expected")
            print(f"                {missing_constraints} would be created "
                  "(or skipped via IF NOT EXISTS)")

            # ── Indexes ──────────────────────────────────────────────
            # Index count varies as the schema evolves; just report current.
            present_indexes = sum(
                1 for _ in session.run(
                    "SHOW INDEXES YIELD name, type WHERE type <> 'LOOKUP' RETURN name"
                )
            )
            print(f"\n  Indexes:      {present_indexes} present "
                  "(includes constraint-backed)")
            print("                ~30 additional named indexes would be "
                  "created (or skipped via IF NOT EXISTS)")

            # ── Total node / relationship counts ─────────────────────
            total_nodes = session.run(
                "MATCH (n) RETURN count(n) AS c"
            ).single()["c"]
            total_rels = session.run(
                "MATCH ()-[r]->() RETURN count(r) AS c"
            ).single()["c"]
            print(f"\n  Current data: {total_nodes} nodes, "
                  f"{total_rels} relationships")
            print(f"                {sample_node_count} sample nodes + "
                  f"{sample_rel_count} sample rels would be MERGEd")

        # Everything below is static text, so the session is already closed.
        # ── Node-type breakdown by team ──────────────────────────────
        print("\n  Sample data by team (a full run with --skip-samples=false):")
        for team, nodes in sample_nodes_by_team:
            print(f"    {team}:")
            for node in nodes:
                print(f"      • {node}")

        print("\n  Sample relationships:")
        for rel in sample_rels:
            print(f"      • {rel}")

        print()
        print("  All writes use MERGE + IF NOT EXISTS, so re-running is")
        print("  idempotent. Nothing has been changed by this dry run.")
        print(rule)
        print()
 def _mask_password(pw):
    """
    Return a display-safe rendering of a password.

    A missing or empty password renders as "(empty)". One- and two-character
    passwords are fully starred. Anything longer keeps its first and last
    character, stars the rest, and appends the total length.
    """
    length = len(pw) if pw else 0
    if length == 0:
        return "(empty)"
    if length < 3:
        return "*" * length
    hidden = "*" * (length - 2)
    return f"{pw[0]}{hidden}{pw[-1]} ({length} chars)"
 def get_credentials(args):
    """
    Collect the Neo4j URI, username and password, then confirm them.

    Priority for the URI and username defaults:
        CLI arg > Environment variable > Built-in default
    The password default comes from $NEO4J_PASSWORD only; it is never read
    from the command line.

    When stdin is a terminal, each value is prompted for with its default
    shown in brackets (Enter accepts it, typing overrides it). The password
    prompt uses getpass so the value is not echoed. A summary with the
    password masked is then printed and the user must confirm with [Y/n];
    declining exits with status 0 before any connection is opened.

    When stdin is not a terminal (cron, pipes, CI) nothing is prompted: the
    resolved defaults are used as-is and the confirmation is skipped, so
    unattended runs do not fail with EOFError on the first input() call.

    Args:
        args: parsed CLI namespace; only ``args.uri`` and ``args.user`` are
            read here.

    Returns:
        A ``(uri, user, password)`` tuple of strings.

    Exits:
        Status 1 if no password is available; status 0 if the user declines
        the confirmation.
    """
    # input() and getpass need a terminal; automation has none.
    interactive = sys.stdin is not None and sys.stdin.isatty()

    uri_default = args.uri or os.environ.get("NEO4J_URI") or "bolt://localhost:7687"
    user_default = args.user or os.environ.get("NEO4J_USER") or "neo4j"
    env_password = os.environ.get("NEO4J_PASSWORD")

    if interactive:
        print()
        print("─" * 60)
        print("  Neo4j Connection")
        print("─" * 60)

        # URI
        uri = input(f"  Neo4j URI [{uri_default}]: ").strip() or uri_default

        # Username
        user = input(f"  Neo4j username [{user_default}]: ").strip() or user_default

        # Password (always via getpass, never echoed)
        if env_password:
            prompt = "  Neo4j password [from $NEO4J_PASSWORD, Enter to accept]: "
        else:
            prompt = "  Neo4j password: "
        password = getpass.getpass(prompt) or env_password or ""
    else:
        uri = uri_default
        user = user_default
        password = env_password or ""

    if not password:
        logger.error("Password is required")
        sys.exit(1)

    # Summary + confirm
    print()
    print("─" * 60)
    print("  Connection summary")
    print("─" * 60)
    print(f"  URI:      {uri}")
    print(f"  User:     {user}")
    print(f"  Password: {_mask_password(password)}")
    print("─" * 60)

    if interactive:
        # An empty answer means "yes" (the capital Y in [Y/n]).
        confirm = input("Proceed with these credentials? [Y/n]: ").strip().lower()
        if confirm and confirm not in ("y", "yes"):
            logger.info("Cancelled by user. No changes made.")
            sys.exit(0)

    return uri, user, password
@@ -891,10 +1024,66 @@ Schema Reference:
    return parser.parse_args()
 def prompt_action(args):
    """
    Show the interactive action menu and return the chosen action key.

    Returns one of: "full", "schema_only", "tests_only", "quit".

    If a CLI flag pre-selects an action (--test-only, --skip-samples), that
    takes precedence and the menu is skipped — useful for cron jobs and
    automation. Otherwise the user is prompted until a valid choice is
    entered. End-of-input at the prompt (Ctrl-D, or a closed stdin) is
    treated as "quit" rather than surfacing as an uncaught EOFError.

    Args:
        args: parsed CLI namespace; only ``args.test_only`` and
            ``args.skip_samples`` are read here.
    """
    # CLI flags take precedence over the interactive menu
    if args.test_only:
        return "tests_only"
    if args.skip_samples:
        return "schema_only"

    print()
    print("─" * 60)
    print("  What would you like to do?")
    print("─" * 60)
    print("  1) Full init with sample data")
    print("       constraints + indexes + sample nodes + verification tests")
    print("  2) Schema only (no sample data)")
    print("       constraints + indexes + verification tests")
    print("  3) Tests only (read-only, no writes)")
    print("       runs connection + APOC + basic functional checks")
    print("  4) Quit")
    print()
    print("  All writes use MERGE + IF NOT EXISTS — running options 1 or 2")
    print("  against an already-initialized database is safe and idempotent.")
    print("─" * 60)

    # Accepted answers, matched case-insensitively after stripping.
    actions = {
        "1": "full",
        "2": "schema_only",
        "3": "tests_only",
        "4": "quit",
        "q": "quit",
        "quit": "quit",
        "exit": "quit",
    }
    while True:
        try:
            choice = input("Choice [1-4]: ").strip()
        except EOFError:
            # No more input will ever arrive; quitting is the only safe
            # answer because every other option may write to the database.
            print()
            return "quit"
        action = actions.get(choice.lower())
        if action is not None:
            return action
        print("  Please enter 1, 2, 3, or 4.")
 def main():
    """
    Main execution function.
-    Collects credentials via prompts or environment variables.
+
    Flow:
      1. Parse CLI args (mostly to allow defaults and automation overrides).
      2. Prompt for URI, username, password — each with a default visible —
         then show a summary and require [Y/n] confirmation.
      3. Open a connection and verify it works.
      4. Run a read-only dry-run preview showing what would be created.
      5. Present an action menu (full / schema only / tests only / quit).
      6. Execute the chosen action.
    CLI flags (--test-only, --skip-samples) skip the menu and pre-select an
    action so cron-style automation still works.
    """
    args = parse_args()
@@ -902,7 +1091,7 @@ def main():
    if args.quiet:
        logging.getLogger().setLevel(logging.WARNING)
-    # Get credentials
+    # Get credentials (interactive prompt + summary + confirm)
    uri, user, password = get_credentials(args)
    logger.info(f"Connecting to Neo4j at {uri}...")
@@ -924,11 +1113,24 @@ def main():
            logger.error(f"✗ Cannot connect to Neo4j at {uri}")
            sys.exit(1)
-        if args.test_only:
+        # Dry-run preview: read-only, shows what a full run would create
-            # Just run basic tests (no schema verification)
+        schema.preview_changes()
        # Decide what to do (CLI flags override the interactive menu)
        action = prompt_action(args)
        if action == "quit":
            logger.info("Cancelled by user. No changes made.")
            sys.exit(0)
        if action == "tests_only":
            success = schema.run_tests(include_schema_tests=False)
            sys.exit(0 if success else 1)
        # Both "full" and "schema_only" go through the same provisioning
        # path; the only difference is whether sample nodes get created.
        create_samples = (action == "full")
        # Display schema documentation
        if not args.skip_docs:
            schema.document_schema()
@@ -942,9 +1144,11 @@ def main():
        schema.create_indexes()
        # Create sample nodes to validate schema
-        if not args.skip_samples:
+        if create_samples:
            logger.info("Creating sample nodes...")
            schema.create_sample_nodes()
        else:
            logger.info("Skipping sample nodes (schema-only run).")
        # Run verification tests (including schema tests)
        logger.info("Verifying schema...")
--- a/utils/neo4j-validate.py
+++ b/utils/neo4j-validate.py
@@ -123,22 +123,66 @@ EXPECTED_INDEX_SAMPLES = [
 ]
 def _mask_password(pw):
    """
    Return a display-safe rendering of a password.

    A missing or empty password renders as "(empty)". One- and two-character
    passwords are fully starred. Anything longer keeps its first and last
    character, stars the rest, and appends the total length.
    """
    length = len(pw) if pw else 0
    if length == 0:
        return "(empty)"
    if length < 3:
        return "*" * length
    hidden = "*" * (length - 2)
    return f"{pw[0]}{hidden}{pw[-1]} ({length} chars)"
 def get_credentials(args):
    """
    Collect the Neo4j URI, username and password, then confirm them.

    Defaults for the URI and username come from the CLI arg, then the
    environment variable (NEO4J_URI / NEO4J_USER), then a built-in fallback.
    The password default comes from $NEO4J_PASSWORD only.

    When stdin is a terminal, each value is prompted for with its default
    shown in brackets (Enter accepts it, typing overrides it). The password
    prompt uses getpass so the value is not echoed. A summary with the
    password masked is then printed and the user must confirm with [Y/n];
    declining exits with status 0 before any connection is opened.

    When stdin is not a terminal (cron, pipes, CI) nothing is prompted: the
    resolved defaults are used as-is and the confirmation is skipped, so
    unattended runs do not fail with EOFError on the first input() call.

    Args:
        args: parsed CLI namespace; only ``args.uri`` and ``args.user`` are
            read here.

    Returns:
        A ``(uri, user, password)`` tuple of strings.

    Exits:
        Status 1 if no password is available; status 0 if the user declines
        the confirmation.
    """
    # input() and getpass need a terminal; automation has none.
    interactive = sys.stdin is not None and sys.stdin.isatty()

    uri_default = args.uri or os.environ.get("NEO4J_URI") or "bolt://localhost:7687"
    user_default = args.user or os.environ.get("NEO4J_USER") or "neo4j"
    env_password = os.environ.get("NEO4J_PASSWORD")

    if interactive:
        print()
        print("─" * 60)
        print("  Neo4j Connection")
        print("─" * 60)

        # URI
        uri = input(f"  Neo4j URI [{uri_default}]: ").strip() or uri_default

        # Username
        user = input(f"  Neo4j username [{user_default}]: ").strip() or user_default

        # Password (always via getpass, never echoed)
        if env_password:
            prompt = "  Neo4j password [from $NEO4J_PASSWORD, Enter to accept]: "
        else:
            prompt = "  Neo4j password: "
        password = getpass.getpass(prompt) or env_password or ""
    else:
        uri = uri_default
        user = user_default
        password = env_password or ""

    if not password:
        print("ERROR: Password is required")
        sys.exit(1)

    # Summary + confirm
    print()
    print("─" * 60)
    print("  Connection summary")
    print("─" * 60)
    print(f"  URI:      {uri}")
    print(f"  User:     {user}")
    print(f"  Password: {_mask_password(password)}")
    print("─" * 60)
    print("  Validation is read-only — no graph data will be modified.")

    if interactive:
        # An empty answer means "yes" (the capital Y in [Y/n]).
        confirm = input("Proceed with these credentials? [Y/n]: ").strip().lower()
        if confirm and confirm not in ("y", "yes"):
            print("Cancelled by user.")
            sys.exit(0)

    return uri, user, password
--- a/utils/pyproject.toml
+++ b/utils/pyproject.toml
@@ -0,0 +1,16 @@
 [project]
 name = "koios-utils"
 version = "0.1.0"
 description = "Operator scripts for the Koios unified Neo4j knowledge graph — schema init and validation"
 requires-python = ">=3.13"
 dependencies = [
    "neo4j>=5.19,<6",
 ]
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 # These are operator scripts, not a library, so there is no package to ship.
 # bypass-selection stops hatchling from erroring when it finds nothing to include.
 bypass-selection = true