Skip to content

Commit c07ea0b

Browse files
committed
Use embedded postgres db to simplify self-hosted setup significantly
Use the pgserver Python package as an embedded Postgres DB, installed directly as a dependency of the khoj Python package. This significantly simplifies self-hosting to just a `pip install khoj`. There is no need to also install Postgres separately. Still use a standard Postgres server for multi-user, production use-cases.
1 parent 56b63f9 commit c07ea0b

File tree

2 files changed

+64
-3
lines changed

2 files changed

+64
-3
lines changed

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ dependencies = [
7070
"itsdangerous == 2.1.2",
7171
"httpx == 0.28.1",
7272
"pgvector == 0.2.4",
73+
"pgserver == 0.1.4",
7374
"psycopg2-binary == 2.9.9",
7475
"lxml == 4.9.3",
7576
"tzdata == 2023.3",

src/khoj/app/settings.py

+63-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
https://docs.djangoproject.com/en/4.2/ref/settings/
1111
"""
1212

13+
import atexit
14+
import logging
1315
import os
1416
from pathlib import Path
1517

@@ -119,13 +121,71 @@
119121
# Database
120122
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
121123
DATA_UPLOAD_MAX_NUMBER_FIELDS = 20000
124+
125+
# Default PostgreSQL configuration.
# These values are used as-is for a standard Postgres server. DB_HOST and
# DB_PORT are overridden below once the embedded server has started.
DB_NAME = os.getenv("POSTGRES_DB", "khoj")
DB_HOST = os.getenv("POSTGRES_HOST", "localhost")
DB_PORT = os.getenv("POSTGRES_PORT", "5432")

# Use pgserver only if the env var is explicitly set to true
USE_EMBEDDED_DB = is_env_var_true("USE_EMBEDDED_DB")

if USE_EMBEDDED_DB:
    # Set up logging for pgserver.
    # The level is INFO, so the logger.debug() calls below are only emitted
    # if this logger's level is lowered elsewhere.
    logger = logging.getLogger("pgserver_django")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        # Guard against attaching a duplicate handler if settings are re-imported
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)

    try:
        # Imported lazily so pgserver is only loaded when the embedded DB is requested
        import pgserver

        # Set up data directory
        PGSERVER_DATA_DIR = os.path.join(BASE_DIR, "pgserver_data")
        os.makedirs(PGSERVER_DATA_DIR, exist_ok=True)

        logger.debug(f"Initializing embedded Postgres DB with data directory: {PGSERVER_DATA_DIR}")

        # Start server
        PGSERVER_INSTANCE = pgserver.get_server(PGSERVER_DATA_DIR)

        # Register cleanup as soon as the server is running, so it is still
        # shut down at interpreter exit if a later setup step fails.
        def cleanup_pgserver():
            """Shut down the embedded Postgres server at interpreter exit."""
            if PGSERVER_INSTANCE:
                logger.debug("Shutting down embedded Postgres DB")
                PGSERVER_INSTANCE.cleanup()

        atexit.register(cleanup_pgserver)

        # Create pgvector extension, if not already exists.
        # NOTE(review): PostgreSQL extensions are per-database and this runs
        # before DB_NAME is created, so it presumably targets pgserver's default
        # database. Confirm the extension is also created inside DB_NAME.
        PGSERVER_INSTANCE.psql("CREATE EXTENSION IF NOT EXISTS vector;")

        # Quote the database name for SQL. Unquoted, a name with a hyphen or
        # quote breaks the statements, and a mixed-case name is case-folded by
        # Postgres so it never matches the lookup below or the name Django uses.
        db_name_literal = "'" + DB_NAME.replace("'", "''") + "'"
        db_name_identifier = '"' + DB_NAME.replace('"', '""') + '"'

        # Create database, if not already exists
        db_exists_result = PGSERVER_INSTANCE.psql(f"SELECT 1 FROM pg_database WHERE datname = {db_name_literal};")
        db_exists = "(1 row)" in db_exists_result  # Check for actual row in result
        if not db_exists:
            logger.info(f"Creating database: {DB_NAME}")
            PGSERVER_INSTANCE.psql(f"CREATE DATABASE {db_name_identifier};")

        # Update database configuration for pgserver.
        # Only reached when every step above succeeded; on failure the
        # standard host and port defaults stay in effect.
        DB_HOST = PGSERVER_DATA_DIR
        DB_PORT = ""  # pgserver uses Unix socket, so port is empty

        logger.info("Embedded Postgres DB started successfully")

    except Exception as e:
        # Best-effort: fall back to the standard PostgreSQL settings, but keep
        # the traceback so the underlying failure can be diagnosed.
        logger.exception("Error initializing embedded Postgres DB: %s. Use standard PostgreSQL server.", e)
180+
181+
# Set the database configuration
122182
DATABASES = {
123183
"default": {
124184
"ENGINE": "django.db.backends.postgresql",
125-
"HOST": os.getenv("POSTGRES_HOST", "localhost"),
126-
"PORT": os.getenv("POSTGRES_PORT", "5432"),
185+
"HOST": DB_HOST,
186+
"PORT": DB_PORT,
127187
"USER": os.getenv("POSTGRES_USER", "postgres"),
128-
"NAME": os.getenv("POSTGRES_DB", "khoj"),
188+
"NAME": DB_NAME,
129189
"PASSWORD": os.getenv("POSTGRES_PASSWORD", "postgres"),
130190
"CONN_MAX_AGE": 0,
131191
"CONN_HEALTH_CHECKS": True,

0 commit comments

Comments
 (0)