From d4891c59eb5413d94932224a04cc7731b6016ead Mon Sep 17 00:00:00 2001 From: Masaki Yatsu Date: Tue, 9 Sep 2025 21:38:35 +0900 Subject: [PATCH] feat(datahub): add DataHub --- README.md | 21 ++ datahub/.gitignore | 4 + ...hub-database-external-secret.gomplate.yaml | 27 ++ ...atahub-oauth-external-secret.gomplate.yaml | 27 ++ ...datahub-prerequisites-values.gomplate.yaml | 80 ++++++ datahub/datahub-values.gomplate.yaml | 210 ++++++++++++++++ datahub/justfile | 235 ++++++++++++++++++ 7 files changed, 604 insertions(+) create mode 100644 datahub/.gitignore create mode 100644 datahub/datahub-database-external-secret.gomplate.yaml create mode 100644 datahub/datahub-oauth-external-secret.gomplate.yaml create mode 100644 datahub/datahub-prerequisites-values.gomplate.yaml create mode 100644 datahub/datahub-values.gomplate.yaml create mode 100644 datahub/justfile diff --git a/README.md b/README.md index 0b83271..0ba02e3 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ A Kubernetes development stack for self-hosted environments, designed to run on - **Object Storage**: MinIO S3-compatible storage - **Data Science**: JupyterHub for collaborative notebooks - **Analytics**: Metabase for business intelligence and data visualization +- **Data Catalog**: DataHub for metadata management and data discovery - **Remote Access**: Cloudflare Tunnel for secure internet connectivity - **Automation**: Just task runner with templated configurations @@ -136,6 +137,26 @@ just metabase::install Access Metabase at `https://metabase.yourdomain.com` and complete the initial setup wizard to create an admin account. 
+### DataHub + +Modern data catalog and metadata management platform: + +- Centralized data discovery and documentation +- Data lineage tracking and impact analysis +- Schema evolution monitoring +- OIDC integration with Keycloak for secure access +- Elasticsearch-powered search and indexing +- Kafka-based real-time metadata streaming +- PostgreSQL backend for metadata storage + +Installation: + +```bash +just datahub::install +``` + +Access DataHub at `https://datahub.yourdomain.com` and use "Sign in with SSO" to authenticate via Keycloak. + ## Common Operations ### User Management diff --git a/datahub/.gitignore b/datahub/.gitignore new file mode 100644 index 0000000..42c8c59 --- /dev/null +++ b/datahub/.gitignore @@ -0,0 +1,4 @@ +datahub-prerequisites-values.yaml +datahub-values.yaml +datahub-database-external-secret.yaml +datahub-oauth-external-secret.yaml diff --git a/datahub/datahub-database-external-secret.gomplate.yaml b/datahub/datahub-database-external-secret.gomplate.yaml new file mode 100644 index 0000000..914258d --- /dev/null +++ b/datahub/datahub-database-external-secret.gomplate.yaml @@ -0,0 +1,27 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: datahub-database-external-secret + namespace: {{ .Env.DATAHUB_NAMESPACE }} +spec: + refreshInterval: 1h + secretStoreRef: + name: vault-secret-store + kind: ClusterSecretStore + target: + name: datahub-database-secret + creationPolicy: Owner + template: + type: Opaque + data: + username: "{{ `{{ .username }}` }}" + password: "{{ `{{ .password }}` }}" + data: + - secretKey: username + remoteRef: + key: datahub/database + property: username + - secretKey: password + remoteRef: + key: datahub/database + property: password \ No newline at end of file diff --git a/datahub/datahub-oauth-external-secret.gomplate.yaml b/datahub/datahub-oauth-external-secret.gomplate.yaml new file mode 100644 index 0000000..b517486 --- /dev/null +++ b/datahub/datahub-oauth-external-secret.gomplate.yaml 
@@ -0,0 +1,27 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: datahub-oauth-external-secret + namespace: {{ .Env.DATAHUB_NAMESPACE }} +spec: + refreshInterval: 1h + secretStoreRef: + name: vault-secret-store + kind: ClusterSecretStore + target: + name: datahub-oauth-secret + creationPolicy: Owner + template: + type: Opaque + data: + client_id: "{{ `{{ .client_id }}` }}" + client_secret: "{{ `{{ .client_secret }}` }}" + data: + - secretKey: client_id + remoteRef: + key: datahub/oauth + property: client_id + - secretKey: client_secret + remoteRef: + key: datahub/oauth + property: client_secret \ No newline at end of file diff --git a/datahub/datahub-prerequisites-values.gomplate.yaml b/datahub/datahub-prerequisites-values.gomplate.yaml new file mode 100644 index 0000000..a2a9a8d --- /dev/null +++ b/datahub/datahub-prerequisites-values.gomplate.yaml @@ -0,0 +1,80 @@ +# DataHub Prerequisites Values +# External services required by DataHub + +# MySQL disabled - using PostgreSQL +mysql: + enabled: false + +# PostgreSQL configuration (external) +postgresql: + enabled: false + +# Elasticsearch configuration +elasticsearch: + enabled: true + replicas: 1 + minimumMasterNodes: 1 + # Use newer version to fix cgroup v2 compatibility (7.17.3 → 7.17.26) + image: "docker.elastic.co/elasticsearch/elasticsearch" + imageTag: "7.17.26" + # Resource limits for development + resources: + requests: + cpu: "100m" + memory: "512Mi" + limits: + cpu: "1000m" + memory: "2Gi" + # Persistence + persistence: + enabled: true + size: "10Gi" + # Security configuration + esConfig: + elasticsearch.yml: | + xpack.security.enabled: false + xpack.security.transport.ssl.enabled: false + xpack.security.http.ssl.enabled: false + +# Kafka configuration +kafka: + enabled: true + replicaCount: 1 + # Resource limits + resources: + requests: + cpu: "100m" + memory: "512Mi" + limits: + cpu: "500m" + memory: "1Gi" + # Persistence + persistence: + enabled: true + size: "10Gi" + # 
Zookeeper configuration + zookeeper: + enabled: true + replicaCount: 1 + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + persistence: + enabled: true + size: "5Gi" + +# Schema Registry +cp-schema-registry: + enabled: true + replicaCount: 1 + resources: + requests: + cpu: "100m" + memory: "512Mi" + limits: + cpu: "500m" + memory: "1Gi" \ No newline at end of file diff --git a/datahub/datahub-values.gomplate.yaml b/datahub/datahub-values.gomplate.yaml new file mode 100644 index 0000000..a85b794 --- /dev/null +++ b/datahub/datahub-values.gomplate.yaml @@ -0,0 +1,210 @@ +# DataHub Main Application Values +# Core DataHub services configuration + +# Global settings +global: + datahub: + version: {{ .Env.DATAHUB_VERSION }} + monitoring: + enablePrometheus: true + + # Kafka configuration + kafka: + bootstrap: + server: "datahub-prerequisites-kafka:9092" + zookeeper: + server: "datahub-prerequisites-zookeeper:2181" + + # Global database configuration + sql: + datasource: + host: "postgres-cluster-rw.postgres.svc.cluster.local:5432" + hostForPostgresClient: "postgres-cluster-rw.postgres.svc.cluster.local" + hostForpostgresqlClient: "postgres-cluster-rw.postgres.svc.cluster.local" + port: "5432" + database: "datahub" + username: "datahub" + password: + secretRef: "datahub-database-secret" + secretKey: "password" + driver: "org.postgresql.Driver" + url: "jdbc:postgresql://postgres-cluster-rw.postgres.svc.cluster.local:5432/datahub?sslmode=require" + +# Database configuration (PostgreSQL) +datahub-gms: + enabled: true + replicaCount: 1 + + # Authentication configuration - using extraEnvs for OIDC + extraEnvs: + - name: AUTH_OIDC_ENABLED + value: "true" + - name: AUTH_OIDC_CLIENT_ID + valueFrom: + secretKeyRef: + name: datahub-oauth-secret + key: client_id + - name: AUTH_OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: datahub-oauth-secret + key: client_secret + - name: AUTH_OIDC_DISCOVERY_URI + value: "https://{{ 
.Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/.well-known/openid-configuration" + - name: AUTH_OIDC_BASE_URL + value: "https://{{ .Env.DATAHUB_HOST }}" + + # Service configuration + service: + type: ClusterIP + + # Resource configuration + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + cpu: "2000m" + memory: "4Gi" + + # JVM configuration + env: + - name: JAVA_OPTS + value: "-Xms1g -Xmx3g" + +# Frontend service +datahub-frontend: + enabled: true + replicaCount: 1 + + # Authentication configuration - using extraEnvs for OIDC + extraEnvs: + - name: AUTH_OIDC_ENABLED + value: "true" + - name: AUTH_OIDC_CLIENT_ID + valueFrom: + secretKeyRef: + name: datahub-oauth-secret + key: client_id + - name: AUTH_OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: datahub-oauth-secret + key: client_secret + - name: AUTH_OIDC_DISCOVERY_URI + value: "https://{{ .Env.KEYCLOAK_HOST }}/realms/{{ .Env.KEYCLOAK_REALM }}/.well-known/openid-configuration" + - name: AUTH_OIDC_BASE_URL + value: "https://{{ .Env.DATAHUB_HOST }}" + + # Resource configuration + # Service configuration + service: + type: ClusterIP + + # Ingress configuration + ingress: + enabled: true + ingressClassName: traefik + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + hosts: + - host: {{ .Env.DATAHUB_HOST }} + paths: + - / + tls: + - hosts: + - {{ .Env.DATAHUB_HOST }} + + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "2Gi" + +# Actions service +datahub-actions: + enabled: true + replicaCount: 1 + + # Resource configuration + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "1Gi" + +# MCE Consumer +datahub-mce-consumer: + enabled: true + replicaCount: 1 + + # Resource configuration + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "1Gi" + +# MAE Consumer +datahub-mae-consumer: + enabled: true + 
replicaCount: 1 + + # Resource configuration + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "1Gi" + +# Setup Jobs +# DataHub's built-in PostgreSQL setup job handles schema initialization and migrations +# Our justfile ensures the database and user exist with proper permissions +mysqlSetupJob: + enabled: false + +postgresqlSetupJob: + enabled: true + host: "postgres-cluster-rw.postgres.svc.cluster.local" + hostForpostgresqlClient: "postgres-cluster-rw.postgres.svc.cluster.local" + port: "5432" + url: "jdbc:postgresql://postgres-cluster-rw.postgres.svc.cluster.local:5432/datahub" + database: "datahub" + username: "datahub" + password: + secretRef: "datahub-database-secret" + secretKey: "password" + # Allow DataHub to handle schema migrations for existing databases + extraInitContainers: [] + # Configure job to be idempotent for existing databases + jobAnnotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation + +# External services configuration +elasticsearch: + # Use prerequisites elasticsearch (elastic chart service name is release-independent) + host: "elasticsearch-master:9200" + +kafka: + # Use prerequisites kafka (release name is datahub-prerequisites; must match global.kafka) + bootstrap: + server: "datahub-prerequisites-kafka:9092" + schemaregistry: + url: "http://datahub-prerequisites-cp-schema-registry:8081" + +# Disable local services (use prerequisites) +mysql: + enabled: false +postgresql: + enabled: false diff --git a/datahub/justfile b/datahub/justfile new file mode 100644 index 0000000..01980c3 --- /dev/null +++ b/datahub/justfile @@ -0,0 +1,235 @@ +set fallback := true + +export DATAHUB_NAMESPACE := env("DATAHUB_NAMESPACE", "datahub") +export DATAHUB_CHART_VERSION := env("DATAHUB_CHART_VERSION", "0.6.21") +export DATAHUB_PREREQUISITES_CHART_VERSION := env("DATAHUB_PREREQUISITES_CHART_VERSION", "0.1.15") +export DATAHUB_VERSION := env("DATAHUB_VERSION", "v1.2.0") +export DATAHUB_HOST := env("DATAHUB_HOST", "") +export
EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets") +export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack") + +[private] +default: + @just --list --unsorted --list-submodules + +# Add Helm repository +add-helm-repo: + helm repo add datahub https://helm.datahubproject.io/ + helm repo update + +# Remove Helm repository +remove-helm-repo: + helm repo remove datahub + +# Create DataHub namespace +create-namespace: + @kubectl get namespace ${DATAHUB_NAMESPACE} &>/dev/null || \ + kubectl create namespace ${DATAHUB_NAMESPACE} + +# Delete DataHub namespace +delete-namespace: + @kubectl delete namespace ${DATAHUB_NAMESPACE} --ignore-not-found + +# Setup database for DataHub +setup-database: + #!/bin/bash + set -euo pipefail + echo "Setting up DataHub database..." + + if just postgres::db-exists datahub &>/dev/null; then + echo "Database 'datahub' already exists. DataHub will handle schema migrations." + else + echo "Creating new database 'datahub'..." + just postgres::create-db datahub + fi + + # Generate password for user creation/update + # For existing users, preserve existing password if possible + if just postgres::user-exists datahub &>/dev/null; then + echo "User 'datahub' already exists." + # Check if we can get existing password from Vault/Secret + if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then + # Try to get existing password from Vault + if DB_PASSWORD=$(just vault::get datahub/database password 2>/dev/null); then + echo "Using existing password from Vault." + else + echo "Generating new password and updating Vault..." + DB_PASSWORD=$(just utils::random-password) + just postgres::psql -c "ALTER USER datahub WITH PASSWORD '$DB_PASSWORD';" + fi + else + # For direct Secret approach, generate new password + echo "Generating new password for existing user..." 
+ DB_PASSWORD=$(just utils::random-password) + just postgres::psql -c "ALTER USER datahub WITH PASSWORD '$DB_PASSWORD';" + fi + else + echo "Creating new user 'datahub'..." + DB_PASSWORD=$(just utils::random-password) + just postgres::create-user datahub "$DB_PASSWORD" + fi + + echo "Ensuring database permissions..." + just postgres::grant datahub datahub + + if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then + echo "External Secrets available. Storing credentials in Vault and creating ExternalSecret..." + just vault::put datahub/database username=datahub password="$DB_PASSWORD" + gomplate -f datahub-database-external-secret.gomplate.yaml -o datahub-database-external-secret.yaml + kubectl apply -f datahub-database-external-secret.yaml + echo "Waiting for database secret to be ready..." + kubectl wait --for=condition=Ready externalsecret/datahub-database-external-secret \ + -n ${DATAHUB_NAMESPACE} --timeout=60s + else + echo "External Secrets not available. Creating Kubernetes Secret directly..." + kubectl delete secret datahub-database-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + kubectl create secret generic datahub-database-secret -n ${DATAHUB_NAMESPACE} \ + --from-literal=username=datahub \ + --from-literal=password="$DB_PASSWORD" + echo "Database secret created directly in Kubernetes" + fi + echo "Database setup completed. DataHub will handle schema initialization and migrations." + +# Delete database secret +delete-database-secret: + @kubectl delete secret datahub-database-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + +# Create OAuth client in Keycloak for DataHub authentication +create-oauth-client: + #!/bin/bash + set -euo pipefail + if [ -z "${DATAHUB_HOST}" ]; then + echo "Error: DATAHUB_HOST environment variable is required" + exit 1 + fi + echo "Creating DataHub OAuth client in Keycloak..." + # Delete existing client to ensure fresh creation + echo "Removing existing client if present..." 
+ just keycloak::delete-client ${KEYCLOAK_REALM} datahub || true + + CLIENT_SECRET=$(just utils::random-password) + just keycloak::create-client \ + ${KEYCLOAK_REALM} \ + datahub \ + "https://${DATAHUB_HOST}/callback/oidc" \ + "$CLIENT_SECRET" + + if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then + echo "External Secrets available. Storing credentials in Vault and recreating ExternalSecret..." + just vault::put datahub/oauth \ + client_id=datahub \ + client_secret="$CLIENT_SECRET" + # Delete existing ExternalSecret to force recreation and refresh + kubectl delete externalsecret datahub-oauth-external-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + kubectl delete secret datahub-oauth-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + gomplate -f datahub-oauth-external-secret.gomplate.yaml -o datahub-oauth-external-secret.yaml + kubectl apply -f datahub-oauth-external-secret.yaml + echo "Waiting for OAuth secret to be ready..." + kubectl wait --for=condition=Ready externalsecret/datahub-oauth-external-secret \ + -n ${DATAHUB_NAMESPACE} --timeout=60s + else + echo "External Secrets not available. Creating Kubernetes Secret directly..." + kubectl delete secret datahub-oauth-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + kubectl create secret generic datahub-oauth-secret -n ${DATAHUB_NAMESPACE} \ + --from-literal=client_id=datahub \ + --from-literal=client_secret="$CLIENT_SECRET" + echo "OAuth secret created directly in Kubernetes" + fi + echo "OAuth client created successfully" + +# Delete OAuth secret +delete-oauth-secret: + @kubectl delete secret datahub-oauth-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + @kubectl delete externalsecret datahub-oauth-external-secret -n ${DATAHUB_NAMESPACE} --ignore-not-found + +# Install DataHub prerequisites +install-prerequisites: + #!/bin/bash + set -euo pipefail + echo "Installing DataHub prerequisites..." 
+ just add-helm-repo + gomplate -f datahub-prerequisites-values.gomplate.yaml -o datahub-prerequisites-values.yaml + helm upgrade --install datahub-prerequisites datahub/datahub-prerequisites \ + --namespace ${DATAHUB_NAMESPACE} \ + --version ${DATAHUB_PREREQUISITES_CHART_VERSION} \ + -f datahub-prerequisites-values.yaml \ + --wait --timeout=10m + echo "Prerequisites installation completed" + +# Install DataHub main application +install-datahub: + #!/bin/bash + set -euo pipefail + echo "Installing DataHub main application..." + gomplate -f datahub-values.gomplate.yaml -o datahub-values.yaml + helm upgrade --install datahub datahub/datahub \ + --namespace ${DATAHUB_NAMESPACE} \ + --version ${DATAHUB_CHART_VERSION} \ + -f datahub-values.yaml \ + --wait --timeout=20m + echo "DataHub installation completed" + +# Install DataHub (full setup) +install: + #!/bin/bash + set -euo pipefail + export DATAHUB_HOST=${DATAHUB_HOST:-} + while [ -z "${DATAHUB_HOST}" ]; do + DATAHUB_HOST=$( + gum input --prompt="DataHub host (FQDN): " --width=100 \ + --placeholder="e.g., datahub.example.com" + ) + done + echo "Installing DataHub..." + just create-namespace + just setup-database + just create-oauth-client + just install-prerequisites + just install-datahub + echo "DataHub installation completed" + echo "Access DataHub at: https://${DATAHUB_HOST}" + +# Uninstall DataHub prerequisites +uninstall-prerequisites: + #!/bin/bash + set -euo pipefail + echo "Uninstalling DataHub prerequisites..." + helm uninstall datahub-prerequisites -n ${DATAHUB_NAMESPACE} --ignore-not-found + +# Uninstall DataHub main application +uninstall-datahub: + #!/bin/bash + set -euo pipefail + echo "Uninstalling DataHub main application..." + helm uninstall datahub -n ${DATAHUB_NAMESPACE} --ignore-not-found + +# Uninstall DataHub (complete removal) +uninstall delete-db='true': + #!/bin/bash + set -euo pipefail + echo "Uninstalling DataHub..." 
+ just uninstall-datahub + just uninstall-prerequisites + just delete-oauth-secret + just delete-database-secret + just delete-namespace + if [ "{{ delete-db }}" = "true" ]; then + just postgres::delete-db datahub + fi + echo "DataHub uninstalled" + +# Clean up database and secrets +cleanup: + #!/bin/bash + set -euo pipefail + echo "This will delete the DataHub database and all secrets." + if gum confirm "Are you sure you want to proceed?"; then + echo "Cleaning up DataHub resources..." + just postgres::delete-db datahub || true + just vault::delete datahub/database || true + just vault::delete datahub/oauth || true + just keycloak::delete-client ${KEYCLOAK_REALM} datahub || true + echo "Cleanup completed" + else + echo "Cleanup cancelled" + fi