feat(dagster): add Dagster

Masaki Yatsu
2025-09-15 19:25:31 +09:00
parent c725124a7a
commit dbcbaedf6f
21 changed files with 4018 additions and 0 deletions

6
dagster/.gitignore vendored Normal file

@@ -0,0 +1,6 @@
dagster-values.yaml
dagster-database-external-secret.yaml
dagster-minio-external-secret.yaml
dagster-oauth-external-secret.yaml
dagster-storage-pvc.yaml
dagster-user-code-pvc.yaml

26
dagster/dagster-database-external-secret.gomplate.yaml Normal file

@@ -0,0 +1,26 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: dagster-database-external-secret
namespace: {{ .Env.DAGSTER_NAMESPACE }}
spec:
refreshInterval: 1h
secretStoreRef:
name: vault-secret-store
kind: ClusterSecretStore
target:
name: dagster-database-secret
creationPolicy: Owner
data:
- secretKey: username
remoteRef:
key: dagster/database
property: username
- secretKey: password
remoteRef:
key: dagster/database
property: password
- secretKey: postgresql-password
remoteRef:
key: dagster/database
property: password

22
dagster/dagster-minio-external-secret.gomplate.yaml Normal file

@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: dagster-minio-external-secret
namespace: {{ .Env.DAGSTER_NAMESPACE }}
spec:
refreshInterval: 1h
secretStoreRef:
name: vault-secret-store
kind: ClusterSecretStore
target:
name: dagster-minio-secret
creationPolicy: Owner
data:
- secretKey: access_key
remoteRef:
key: dagster/minio
property: access_key
- secretKey: secret_key
remoteRef:
key: dagster/minio
property: secret_key

22
dagster/dagster-oauth-external-secret.gomplate.yaml Normal file

@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: dagster-oauth-external-secret
namespace: {{ .Env.DAGSTER_NAMESPACE }}
spec:
refreshInterval: 1h
secretStoreRef:
name: vault-secret-store
kind: ClusterSecretStore
target:
name: dagster-oauth-secret
creationPolicy: Owner
data:
- secretKey: client_id
remoteRef:
key: dagster/oauth
property: client_id
- secretKey: client_secret
remoteRef:
key: dagster/oauth
property: client_secret

18
dagster/dagster-storage-pvc.gomplate.yaml Normal file

@@ -0,0 +1,18 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: dagster-storage-pvc
namespace: {{ .Env.DAGSTER_NAMESPACE }}
spec:
accessModes:
{{- if eq .Env.STORAGE_CLASS "longhorn" }}
- ReadWriteMany # Longhorn supports RWX
{{- else }}
- ReadWriteOnce # Default storage class typically supports RWO
{{- end }}
{{- if .Env.STORAGE_CLASS }}
storageClassName: {{ .Env.STORAGE_CLASS }}
{{- end }}
resources:
requests:
storage: {{ .Env.DAGSTER_STORAGE_SIZE }}

19
dagster/dagster-user-code-pvc.gomplate.yaml Normal file

@@ -0,0 +1,19 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: dagster-user-code-pvc
namespace: {{ .Env.DAGSTER_NAMESPACE }}
spec:
{{- if .Env.LONGHORN_AVAILABLE }}
accessModes:
- ReadWriteMany # Longhorn supports RWX
{{- else }}
accessModes:
- ReadWriteOnce # Fallback to RWO
{{- end }}
resources:
requests:
storage: {{ .Env.DAGSTER_CODE_STORAGE_SIZE }}
{{- if .Env.LONGHORN_AVAILABLE }}
storageClassName: longhorn
{{- end }}

148
dagster/dagster-values.gomplate.yaml Normal file

@@ -0,0 +1,148 @@
# Dagster Helm Chart Values
# Configuration for Dagster deployment
global:
serviceAccountName: "dagster"
postgresqlSecretName: "dagster-database-secret"
# Disable automatic PostgreSQL secret generation
generatePostgresqlPasswordSecret: false
dagsterWebserver:
replicaCount: 1
image:
repository: "{{ .Env.DAGSTER_CONTAINER_IMAGE }}"
tag: "{{ .Env.DAGSTER_CONTAINER_TAG }}"
pullPolicy: "{{ .Env.DAGSTER_CONTAINER_PULL_POLICY }}"
service:
type: ClusterIP
port: 80
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "1Gi"
cpu: "1000m"
env:
- name: DAGSTER_HOME
value: /opt/dagster/dagster_home
- name: PYTHONPATH
value: /opt/dagster/user-code
- name: PIP_USER
value: "true"
volumeMounts:
- name: user-code
mountPath: /opt/dagster/user-code
volumes:
- name: user-code
persistentVolumeClaim:
claimName: dagster-user-code-pvc
workspace:
enabled: true
servers: []
dagsterDaemon:
enabled: true
image:
repository: "{{ .Env.DAGSTER_CONTAINER_IMAGE }}"
tag: "{{ .Env.DAGSTER_CONTAINER_TAG }}"
pullPolicy: "{{ .Env.DAGSTER_CONTAINER_PULL_POLICY }}"
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "1Gi"
cpu: "1000m"
volumeMounts:
- name: user-code
mountPath: /opt/dagster/user-code
volumes:
- name: user-code
persistentVolumeClaim:
claimName: dagster-user-code-pvc
env:
- name: DAGSTER_HOME
value: /opt/dagster/dagster_home
- name: PYTHONPATH
value: /opt/dagster/user-code
- name: PIP_USER
value: "true"
runLauncher:
type: K8sRunLauncher
config:
k8sRunLauncher:
image: "{{ .Env.DAGSTER_CONTAINER_IMAGE }}:{{ .Env.DAGSTER_CONTAINER_TAG }}"
imagePullPolicy: "{{ .Env.DAGSTER_CONTAINER_PULL_POLICY }}"
jobNamespace: "{{ .Env.DAGSTER_NAMESPACE }}"
loadInclusterConfig: true
volumeMounts:
- name: user-code
mountPath: /opt/dagster/user-code
volumes:
- name: user-code
persistentVolumeClaim:
claimName: dagster-user-code-pvc
{{- if eq (.Env.DAGSTER_STORAGE_TYPE | default "local") "minio" }}
envSecrets:
- name: dagster-database-secret
- name: dagster-minio-secret
{{- else }}
envSecrets:
- name: dagster-database-secret
{{- end }}
postgresql:
enabled: false
postgresqlHost: "postgres-cluster-rw.postgres.svc.cluster.local"
postgresqlUsername: "dagster"
postgresqlPassword: ""
postgresqlDatabase: "dagster"
service:
port: 5432
userDeployments:
enabled: false
dagster-user-deployments:
enabled: true
enableSubchart: false
deployments: []
{{- if eq (.Env.DAGSTER_STORAGE_TYPE | default "local") "minio" }}
computeLogManager:
type: S3ComputeLogManager
config:
s3ComputeLogManager:
bucket: "dagster-logs"
region: "us-east-1"
endpointUrl: "http://minio.{{ .Env.MINIO_NAMESPACE }}.svc.cluster.local:9000"
useSSL: false
secretName: "dagster-minio-secret"
{{- else }}
computeLogManager:
type: NoOpComputeLogManager
{{- end }}
dagsterHome: "/opt/dagster/dagster_home"
serviceAccount:
create: true
name: "dagster"
rbac:
create: true

205
dagster/examples/dagster_tutorial/.gitignore Normal file

@@ -0,0 +1,205 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml
.tmp*

61
dagster/examples/dagster_tutorial/README.md Normal file

@@ -0,0 +1,61 @@
# dagster_tutorial
## Getting started
### Installing dependencies
**Option 1: uv**
Ensure [`uv`](https://docs.astral.sh/uv/) is installed following their [official documentation](https://docs.astral.sh/uv/getting-started/installation/).
Create a virtual environment, and install the required dependencies using _sync_:
```bash
uv sync
```
Then, activate the virtual environment:
| OS | Command |
| --- | --- |
| macOS | ```source .venv/bin/activate``` |
| Windows | ```.venv\Scripts\activate``` |
**Option 2: pip**
Create a virtual environment:
```bash
python3 -m venv .venv
```
Then activate the virtual environment:
| OS | Command |
| --- | --- |
| macOS | ```source .venv/bin/activate``` |
| Windows | ```.venv\Scripts\activate``` |
Install the required dependencies with [pip](https://pypi.org/project/pip/):
```bash
pip install -e ".[dev]"
```
### Running Dagster
Start the Dagster UI web server:
```bash
dg dev
```
Open http://localhost:3000 in your browser to see the project.
## Learn more
To learn more about this template and Dagster in general:
- [Dagster Documentation](https://docs.dagster.io/)
- [Dagster University](https://courses.dagster.io/)
- [Dagster Slack Community](https://dagster.io/slack)

32
dagster/examples/dagster_tutorial/pyproject.toml Normal file

@@ -0,0 +1,32 @@
[project]
name = "dagster_tutorial"
requires-python = ">=3.9,<3.14"
version = "0.1.0"
dependencies = [
"dagster==1.11.10",
"dagster-duckdb>=0.27.10",
]
[dependency-groups]
dev = [
"dagster-webserver",
"dagster-dg-cli",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
force-include = { "pyproject.toml" = "pyproject.toml" }
[tool.dg]
directory_type = "project"
[tool.dg.project]
root_module = "dagster_tutorial"
registry_modules = [
"dagster_tutorial.components.*",
]


@@ -0,0 +1,8 @@
from pathlib import Path
from dagster import definitions, load_from_defs_folder
@definitions
def defs():
return load_from_defs_folder(path_within_project=Path(__file__).parent)


@@ -0,0 +1,96 @@
import dagster as dg
from dagster_duckdb import DuckDBResource
@dg.asset
def customers(duckdb: DuckDBResource):
url = "https://raw.githubusercontent.com/dbt-labs/jaffle-shop-classic/refs/heads/main/seeds/raw_customers.csv"
table_name = "customers"
with duckdb.get_connection() as conn:
conn.execute(
f"""
create or replace table {table_name} as (
select * from read_csv_auto('{url}')
)
"""
)
@dg.asset
def orders(duckdb: DuckDBResource):
url = "https://raw.githubusercontent.com/dbt-labs/jaffle-shop-classic/refs/heads/main/seeds/raw_orders.csv"
table_name = "orders"
with duckdb.get_connection() as conn:
conn.execute(
f"""
create or replace table {table_name} as (
select * from read_csv_auto('{url}')
)
"""
)
@dg.asset
def payments(duckdb: DuckDBResource):
url = "https://raw.githubusercontent.com/dbt-labs/jaffle-shop-classic/refs/heads/main/seeds/raw_payments.csv"
table_name = "payments"
with duckdb.get_connection() as conn:
conn.execute(
f"""
create or replace table {table_name} as (
select * from read_csv_auto('{url}')
)
"""
)
@dg.asset(
deps=["customers", "orders", "payments"],
)
def orders_aggregation(duckdb: DuckDBResource):
table_name = "orders_aggregation"
with duckdb.get_connection() as conn:
conn.execute(
f"""
create or replace table {table_name} as (
select
c.id as customer_id,
c.first_name,
c.last_name,
count(distinct o.id) as total_orders,
count(distinct p.id) as total_payments,
coalesce(sum(p.amount), 0) as total_amount_spent
from customers c
left join orders o
on c.id = o.user_id
left join payments p
on o.id = p.order_id
group by 1, 2, 3
);
"""
)
@dg.asset_check(asset="orders_aggregation")
def orders_aggregation_check(duckdb: DuckDBResource) -> dg.AssetCheckResult:
table_name = "orders_aggregation"
with duckdb.get_connection() as conn:
res = conn.execute(f"select count(*) from {table_name}").fetchone()
if res is None:
return dg.AssetCheckResult(
passed=False, metadata={"message": "Order aggregation check failed"}
)
row_count = res[0]
if row_count == 0:
return dg.AssetCheckResult(
passed=False, metadata={"message": "Order aggregation check failed"}
)
return dg.AssetCheckResult(
passed=True, metadata={"message": "Order aggregation check passed"}
)

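These assets and the check can be exercised locally before any cluster deployment. A minimal smoke-test sketch, assuming the dependencies from pyproject.toml are installed; the import path below is a guess at the tutorial's module layout and may need adjusting:

```python
import dagster as dg
from dagster_duckdb import DuckDBResource

# Hypothetical import path -- adjust to wherever assets.py lives
# inside the dagster_tutorial package.
from dagster_tutorial.defs.assets import (
    customers,
    orders,
    orders_aggregation,
    payments,
)

if __name__ == "__main__":
    # Materialize all four assets in-process against a throwaway DuckDB file.
    result = dg.materialize(
        [customers, orders, payments, orders_aggregation],
        resources={"duckdb": DuckDBResource(database="/tmp/smoke_test.duckdb")},
    )
    assert result.success
```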

@@ -0,0 +1,13 @@
import dagster as dg
from dagster_duckdb import DuckDBResource
database_resource = DuckDBResource(database="/tmp/jaffle_platform.duckdb")
@dg.definitions
def resources():
return dg.Definitions(
resources={
"duckdb": database_resource,
}
)

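The hard-coded `/tmp` path only suits local runs; on the cluster, pointing DuckDB at the mounted PVC would persist data across pod restarts. A sketch, assuming a `DUCKDB_DATABASE` environment variable that nothing in this commit sets (it would have to be added to the Helm values):

```python
import os

import dagster as dg
from dagster_duckdb import DuckDBResource

# DUCKDB_DATABASE is a hypothetical variable; fall back to the local default.
database_resource = DuckDBResource(
    database=os.getenv("DUCKDB_DATABASE", "/tmp/jaffle_platform.duckdb")
)


@dg.definitions
def resources():
    return dg.Definitions(
        resources={
            "duckdb": database_resource,
        }
    )
```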

@@ -0,0 +1,17 @@
from typing import Union
import dagster as dg
# @dg.schedule(cron_schedule="@daily", target="*")
# def schedules(context: dg.ScheduleEvaluationContext) -> Union[dg.RunRequest, dg.SkipReason]:
# return dg.SkipReason("Skipping. Change this to return a RunRequest to launch a run.")
@dg.schedule(cron_schedule="* * * * *", target="*")
def tutorial_schedule(
context: dg.ScheduleEvaluationContext,
) -> Union[dg.RunRequest, dg.SkipReason]:
return dg.SkipReason(
"Skipping. Change this to return a RunRequest to launch a run."
)

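As the commented-out scaffold suggests, the schedule launches runs once it returns a `RunRequest` instead of a `SkipReason`. A sketch, using a daily cron expression in place of the every-minute one above:

```python
import dagster as dg


@dg.schedule(cron_schedule="0 6 * * *", target="*")
def daily_schedule(context: dg.ScheduleEvaluationContext) -> dg.RunRequest:
    # run_key makes retried evaluations of the same tick idempotent.
    return dg.RunRequest(
        run_key=context.scheduled_execution_time.isoformat(),
        tags={"schedule": "daily_schedule"},
    )
```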

@@ -0,0 +1 @@

2699
dagster/examples/dagster_tutorial/uv.lock generated Normal file

File diff suppressed because it is too large

621
dagster/justfile Normal file

@@ -0,0 +1,621 @@
set fallback := true
export DAGSTER_NAMESPACE := env("DAGSTER_NAMESPACE", "dagster")
export DAGSTER_CHART_VERSION := env("DAGSTER_CHART_VERSION", "1.11.10")
export DAGSTER_CONTAINER_IMAGE := env("DAGSTER_CONTAINER_IMAGE", "docker.io/dagster/dagster-k8s")
export DAGSTER_CONTAINER_TAG := env("DAGSTER_CONTAINER_TAG", "1.11.10")
export DAGSTER_CONTAINER_PULL_POLICY := env("DAGSTER_CONTAINER_PULL_POLICY", "IfNotPresent")
export DAGSTER_HOST := env("DAGSTER_HOST", "")
export EXTERNAL_SECRETS_NAMESPACE := env("EXTERNAL_SECRETS_NAMESPACE", "external-secrets")
export KEYCLOAK_REALM := env("KEYCLOAK_REALM", "buunstack")
export DAGSTER_STORAGE_SIZE := env("DAGSTER_STORAGE_SIZE", "20Gi")
export DAGSTER_CODE_STORAGE_SIZE := env("DAGSTER_CODE_STORAGE_SIZE", "10Gi")
export MINIO_NAMESPACE := env("MINIO_NAMESPACE", "minio")
export DAGSTER_STORAGE_TYPE := env("DAGSTER_STORAGE_TYPE", "")
[private]
default:
@just --list --unsorted --list-submodules
# Add Helm repository
add-helm-repo:
helm repo add dagster https://dagster-io.github.io/helm
helm repo update
# Remove Helm repository
remove-helm-repo:
helm repo remove dagster
# Create Dagster namespace
create-namespace:
@kubectl get namespace ${DAGSTER_NAMESPACE} &>/dev/null || \
kubectl create namespace ${DAGSTER_NAMESPACE}
# Delete Dagster namespace
delete-namespace:
@kubectl delete namespace ${DAGSTER_NAMESPACE} --ignore-not-found
# Setup database for Dagster
setup-database:
#!/bin/bash
set -euo pipefail
echo "Setting up Dagster database..."
if just postgres::db-exists dagster &>/dev/null; then
echo "Database 'dagster' already exists. Dagster will handle schema migrations."
else
echo "Creating new database 'dagster'..."
just postgres::create-db dagster
fi
# Generate password for user creation/update
# For existing users, preserve existing password if possible
if just postgres::user-exists dagster &>/dev/null; then
echo "User 'dagster' already exists."
# Check if we can get existing password from Vault/Secret
if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
# Try to get existing password from Vault
if DB_PASSWORD=$(just vault::get dagster/database password 2>/dev/null); then
echo "Using existing password from Vault."
else
echo "Generating new password and updating Vault..."
DB_PASSWORD=$(just utils::random-password)
just postgres::psql -c "ALTER USER dagster WITH PASSWORD '$DB_PASSWORD';"
fi
else
# For direct Secret approach, generate new password
echo "Generating new password for existing user..."
DB_PASSWORD=$(just utils::random-password)
just postgres::psql -c "ALTER USER dagster WITH PASSWORD '$DB_PASSWORD';"
fi
else
echo "Creating new user 'dagster'..."
DB_PASSWORD=$(just utils::random-password)
just postgres::create-user dagster "$DB_PASSWORD"
fi
echo "Ensuring database permissions..."
just postgres::grant dagster dagster
if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
echo "External Secrets available. Storing credentials in Vault and creating ExternalSecret..."
just vault::put dagster/database username=dagster password="$DB_PASSWORD"
gomplate -f dagster-database-external-secret.gomplate.yaml -o dagster-database-external-secret.yaml
kubectl apply -f dagster-database-external-secret.yaml
echo "Waiting for database secret to be ready..."
kubectl wait --for=condition=Ready externalsecret/dagster-database-external-secret \
-n ${DAGSTER_NAMESPACE} --timeout=60s
else
echo "External Secrets not available. Creating Kubernetes Secret directly..."
kubectl delete secret dagster-database-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
kubectl create secret generic dagster-database-secret -n ${DAGSTER_NAMESPACE} \
--from-literal=username=dagster \
--from-literal=password="$DB_PASSWORD"
echo "Database secret created directly in Kubernetes"
fi
echo "Database setup completed. Dagster will handle schema initialization and migrations."
# Delete database secret
delete-database-secret:
@kubectl delete secret dagster-database-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
# Create OAuth client in Keycloak for Dagster authentication
create-oauth-client:
#!/bin/bash
set -euo pipefail
if [ -z "${DAGSTER_HOST}" ]; then
echo "Error: DAGSTER_HOST environment variable is required"
exit 1
fi
echo "Creating Dagster OAuth client in Keycloak..."
# Delete existing client to ensure fresh creation
echo "Removing existing client if present..."
just keycloak::delete-client ${KEYCLOAK_REALM} dagster || true
# Create confidential client for oauth2-proxy
CLIENT_SECRET=$(just utils::random-password)
just keycloak::create-client \
${KEYCLOAK_REALM} \
dagster \
"https://${DAGSTER_HOST}/oauth2/callback" \
"$CLIENT_SECRET"
if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
echo "External Secrets available. Storing credentials in Vault and recreating ExternalSecret..."
just vault::put dagster/oauth \
client_id=dagster \
client_secret="$CLIENT_SECRET"
# Delete existing ExternalSecret to force recreation and refresh
kubectl delete externalsecret dagster-oauth-external-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
kubectl delete secret dagster-oauth-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
gomplate -f dagster-oauth-external-secret.gomplate.yaml -o dagster-oauth-external-secret.yaml
kubectl apply -f dagster-oauth-external-secret.yaml
echo "Waiting for OAuth secret to be ready..."
kubectl wait --for=condition=Ready externalsecret/dagster-oauth-external-secret \
-n ${DAGSTER_NAMESPACE} --timeout=60s
else
echo "External Secrets not available. Creating Kubernetes Secret directly..."
kubectl delete secret dagster-oauth-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
kubectl create secret generic dagster-oauth-secret -n ${DAGSTER_NAMESPACE} \
--from-literal=client_id=dagster \
--from-literal=client_secret="$CLIENT_SECRET"
echo "OAuth secret created directly in Kubernetes"
fi
echo "OAuth client created successfully"
# Delete OAuth secret
delete-oauth-secret:
@kubectl delete secret dagster-oauth-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
@kubectl delete externalsecret dagster-oauth-external-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
# Setup MinIO storage for Dagster
setup-minio-storage:
#!/bin/bash
set -euo pipefail
echo "Setting up MinIO storage for Dagster..."
# Check if MinIO is available
if ! kubectl get service minio -n ${MINIO_NAMESPACE} &>/dev/null; then
echo "Error: MinIO is not installed. Please install MinIO first with 'just minio::install'"
exit 1
fi
# Create MinIO user and bucket for Dagster
# Default buckets: dagster-data (for data files), dagster-logs (for compute logs)
just minio::create-user dagster "dagster-data"
just minio::create-bucket dagster-logs
# Note: minio::create-user already grants readwrite policy to the user
if helm status external-secrets -n ${EXTERNAL_SECRETS_NAMESPACE} &>/dev/null; then
echo "Creating ExternalSecret for MinIO credentials..."
gomplate -f dagster-minio-external-secret.gomplate.yaml -o dagster-minio-external-secret.yaml
kubectl apply -f dagster-minio-external-secret.yaml
echo "Waiting for MinIO secret to be ready..."
kubectl wait --for=condition=Ready externalsecret/dagster-minio-external-secret \
-n ${DAGSTER_NAMESPACE} --timeout=60s
else
echo "External Secrets not available. Creating Kubernetes Secret directly..."
# Get credentials from Vault (stored by minio::create-user)
ACCESS_KEY=dagster
SECRET_KEY=$(just vault::get dagster/minio secret_key 2>/dev/null || echo "")
if [ -z "$SECRET_KEY" ]; then
echo "Error: Could not retrieve MinIO credentials. Please check Vault."
exit 1
fi
kubectl delete secret dagster-minio-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
kubectl create secret generic dagster-minio-secret -n ${DAGSTER_NAMESPACE} \
--from-literal=access_key="$ACCESS_KEY" \
--from-literal=secret_key="$SECRET_KEY" \
--from-literal=data_bucket="dagster-data" \
--from-literal=logs_bucket="dagster-logs" \
--from-literal=endpoint="http://minio.${MINIO_NAMESPACE}.svc.cluster.local:9000"
echo "MinIO secret created directly in Kubernetes"
fi
echo "MinIO storage setup completed"
# Delete MinIO secret
delete-minio-secret:
@kubectl delete secret dagster-minio-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
@kubectl delete externalsecret dagster-minio-external-secret -n ${DAGSTER_NAMESPACE} --ignore-not-found
# Setup PVC storage for Dagster
setup-pvc-storage:
#!/bin/bash
set -euo pipefail
echo "Setting up PVC storage for Dagster..."
# Detect storage class
export STORAGE_CLASS=""
if kubectl get storageclass longhorn &>/dev/null && \
kubectl get pods -n longhorn-system 2>/dev/null | grep -q longhorn-manager; then
echo "Longhorn detected - using longhorn storage class"
export STORAGE_CLASS="longhorn"
else
echo "Using default storage class"
fi
# Create PVC for Dagster storage if it doesn't exist
if ! kubectl get pvc dagster-storage-pvc -n ${DAGSTER_NAMESPACE} &>/dev/null; then
echo "Creating PersistentVolumeClaim for Dagster storage..."
gomplate -f dagster-storage-pvc.gomplate.yaml -o dagster-storage-pvc.yaml
kubectl apply -f dagster-storage-pvc.yaml
echo "Waiting for PVC to be bound..."
# Wait for PVC to be bound
for i in {1..90}; do
STATUS=$(kubectl get pvc dagster-storage-pvc -n ${DAGSTER_NAMESPACE} -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
if [ "$STATUS" = "Bound" ]; then
echo "PVC bound successfully"
break
elif [ $i -eq 90 ]; then
echo "Timeout waiting for PVC to bind"
exit 1
fi
echo "Waiting for PVC to bind... (${i}/90) Status: ${STATUS}"
sleep 2
done
else
echo "PVC already exists"
fi
echo "PVC storage setup completed"
# Setup shared PVC for user code (supports ReadWriteMany with Longhorn)
setup-user-code-pvc:
#!/bin/bash
set -euo pipefail
echo "Setting up shared PVC for user code..."
# Detect if Longhorn is available (same as Airbyte)
export LONGHORN_AVAILABLE="false"
if kubectl get storageclass longhorn &>/dev/null && \
kubectl get pods -n longhorn-system 2>/dev/null | grep -q longhorn-manager; then
echo "Longhorn detected - using ReadWriteMany with longhorn storage class"
export LONGHORN_AVAILABLE="true"
else
echo "Longhorn not detected - using ReadWriteOnce"
export LONGHORN_AVAILABLE="false"
fi
# Create PVC for user code if it doesn't exist
if ! kubectl get pvc dagster-user-code-pvc -n ${DAGSTER_NAMESPACE} &>/dev/null; then
echo "Creating PersistentVolumeClaim for user code..."
gomplate -f dagster-user-code-pvc.gomplate.yaml -o dagster-user-code-pvc.yaml
kubectl apply -f dagster-user-code-pvc.yaml
echo "Waiting for user code PVC to be bound..."
# Wait for PVC to be bound
for i in {1..90}; do
STATUS=$(kubectl get pvc dagster-user-code-pvc -n ${DAGSTER_NAMESPACE} -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
if [ "$STATUS" = "Bound" ]; then
echo "User code PVC bound successfully"
break
elif [ $i -eq 90 ]; then
echo "Timeout waiting for user code PVC to bind"
exit 1
fi
echo "Waiting for user code PVC to bind... (${i}/90) Status: ${STATUS}"
sleep 2
done
# Display PVC info
ACCESS_MODE=$(
kubectl get pvc dagster-user-code-pvc -n ${DAGSTER_NAMESPACE} \
-o jsonpath='{.spec.accessModes[0]}'
)
STORAGE_CLASS=$(
kubectl get pvc dagster-user-code-pvc -n ${DAGSTER_NAMESPACE} \
-o jsonpath='{.spec.storageClassName}'
)
echo "User code PVC created with access mode: $ACCESS_MODE, storage class: ${STORAGE_CLASS:-default}"
else
echo "User code PVC already exists"
fi
echo "User code PVC setup completed"
# Delete PVC storage
delete-pvc-storage:
@kubectl delete pvc dagster-storage-pvc -n ${DAGSTER_NAMESPACE} --ignore-not-found
@kubectl delete pvc dagster-user-code-pvc -n ${DAGSTER_NAMESPACE} --ignore-not-found
# Add a Python module to workspace.yaml
add-workspace-module module_name working_directory:
#!/bin/bash
set -euo pipefail
MODULE_NAME="{{ module_name }}"
WORKING_DIR="{{ working_directory }}"
echo "Adding module '${MODULE_NAME}' to workspace..."
# Get current workspace.yaml from ConfigMap
CURRENT_WORKSPACE=$(kubectl get configmap dagster-workspace-yaml -n ${DAGSTER_NAMESPACE} -o jsonpath='{.data.workspace\.yaml}')
# Create temporary file with current content
echo "$CURRENT_WORKSPACE" > /tmp/current_workspace.yaml
# Check if module already exists
if echo "$CURRENT_WORKSPACE" | grep -q "module_name: ${MODULE_NAME}"; then
echo "Module '${MODULE_NAME}' already exists in workspace - skipping workspace update"
echo "✓ Project files updated successfully"
exit 0
fi
# Create new workspace entry with proper escaping
cat > /tmp/new_entry.txt << EOF
- python_module:
module_name: ${MODULE_NAME}
working_directory: ${WORKING_DIR}
EOF
# Add to workspace
if echo "$CURRENT_WORKSPACE" | grep -q "load_from: \[\]"; then
# Replace empty array with new entry
NEW_WORKSPACE=$(echo "$CURRENT_WORKSPACE" | sed 's/load_from: \[\]/load_from:/')
NEW_WORKSPACE="${NEW_WORKSPACE}"$'\n'"$(cat /tmp/new_entry.txt)"
else
# Append to existing entries
NEW_WORKSPACE="${CURRENT_WORKSPACE}"$'\n'"$(cat /tmp/new_entry.txt)"
fi
# Update ConfigMap using jq with proper key escaping
PATCH_JSON=$(jq -n --arg workspace "$NEW_WORKSPACE" '{"data": {"workspace.yaml": $workspace}}')
kubectl patch configmap dagster-workspace-yaml -n ${DAGSTER_NAMESPACE} --patch "$PATCH_JSON"
echo "✓ Module '${MODULE_NAME}' added to workspace"
echo "Restarting Dagster to reload workspace..."
kubectl rollout restart deployment/dagster-dagster-webserver -n ${DAGSTER_NAMESPACE}
kubectl rollout restart deployment/dagster-daemon -n ${DAGSTER_NAMESPACE}
# Note: add-workspace-file command has been removed due to sed parsing issues
# Use add-workspace-module command instead for adding Python modules to workspace
# Deploy a project to shared PVC
[no-cd]
deploy-project project_dir='':
#!/bin/bash
set -euo pipefail
PROJECT_DIR="{{ project_dir }}"
# Interactive input if not provided
while [ -z "${PROJECT_DIR}" ]; do
PROJECT_DIR=$(gum input --prompt="Project directory path: " --width=100 \
--placeholder="e.g., ./my_project or /path/to/project")
done
# Check if directory exists first
if [ ! -d "${PROJECT_DIR}" ]; then
echo "Error: Project directory '${PROJECT_DIR}' not found"
echo "Please provide a valid project directory path"
exit 1
fi
# Convert to absolute path
PROJECT_DIR=$(realpath "${PROJECT_DIR}")
PROJECT_NAME=$(basename "${PROJECT_DIR}")
# Validate project name - no hyphens allowed
if echo "${PROJECT_NAME}" | grep -q '-'; then
echo "Error: Project directory name '${PROJECT_NAME}' contains hyphens"
echo "Please rename the directory to use underscores instead of hyphens"
echo "Example: '${PROJECT_NAME}' -> '$(echo "${PROJECT_NAME}" | tr '-' '_')'"
exit 1
fi
# Project name is also the Python module name (no conversion needed)
PYTHON_MODULE_NAME="${PROJECT_NAME}"
echo "Using project directory: ${PROJECT_DIR}"
echo "Project name: ${PROJECT_NAME}"
echo "Python module name: ${PYTHON_MODULE_NAME}"
# Check if user code PVC exists
if ! kubectl get pvc dagster-user-code-pvc -n ${DAGSTER_NAMESPACE} &>/dev/null; then
echo "Error: User code PVC not found. Run 'just dagster::setup-user-code-pvc' first."
exit 1
fi
# Check if Longhorn is available for ReadWriteMany support
if kubectl get storageclass longhorn &>/dev/null; then
echo "Longhorn detected - PVC supports ReadWriteMany for sharing with other services"
else
echo "Longhorn not detected - PVC will use ReadWriteOnce (Dagster-only access)"
fi
echo "Deploying project '${PROJECT_NAME}'..."
# Find running Dagster webserver pod
DAGSTER_POD=$(kubectl get pods -n ${DAGSTER_NAMESPACE} -l component=dagster-webserver -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
if [ -z "$DAGSTER_POD" ] || ! kubectl get pod "$DAGSTER_POD" -n ${DAGSTER_NAMESPACE} &>/dev/null; then
echo "Error: No running Dagster webserver pod found"
echo "Please ensure Dagster is installed and running first"
exit 1
fi
echo "Using Dagster webserver pod: $DAGSTER_POD"
# Create directory if it doesn't exist
kubectl exec "$DAGSTER_POD" -n ${DAGSTER_NAMESPACE} -- mkdir -p "/opt/dagster/user-code/${PROJECT_NAME}" 2>/dev/null || true
# Copy project files
echo "Copying project files to shared PVC..."
kubectl cp "${PROJECT_DIR}/." "${DAGSTER_NAMESPACE}/${DAGSTER_POD}:/opt/dagster/user-code/${PROJECT_NAME}/"
# Determine the correct working directory (check if src directory exists)
WORKING_DIR="/opt/dagster/user-code/${PROJECT_NAME}"
if kubectl exec "$DAGSTER_POD" -n ${DAGSTER_NAMESPACE} -- test -d "/opt/dagster/user-code/${PROJECT_NAME}/src" 2>/dev/null; then
WORKING_DIR="/opt/dagster/user-code/${PROJECT_NAME}/src"
echo "Found src directory, using: ${WORKING_DIR}"
else
echo "Using project root: ${WORKING_DIR}"
fi
# Add to workspace (use definitions submodule)
just dagster::add-workspace-module "${PYTHON_MODULE_NAME}.definitions" "${WORKING_DIR}"
echo "✓ Project '${PROJECT_NAME}' deployed successfully"
echo "Files location: /opt/dagster/user-code/${PROJECT_NAME}"
# Remove a project from shared PVC
[no-cd]
remove-project project_name='':
#!/bin/bash
set -euo pipefail
PROJECT_NAME="{{ project_name }}"
# Interactive input if not provided
while [ -z "${PROJECT_NAME}" ]; do
PROJECT_NAME=$(gum input --prompt="Project name to remove: " --width=100 \
--placeholder="e.g., dagster-tutorial")
done
# Confirmation prompt
if ! gum confirm "Are you sure you want to remove project '${PROJECT_NAME}'?"; then
echo "Cancelled"
exit 0
fi
# Validate project name - no hyphens allowed
if echo "${PROJECT_NAME}" | grep -q '-'; then
echo "Error: Project name '${PROJECT_NAME}' contains hyphens"
echo "Project names with hyphens are not supported"
exit 1
fi
# Project name is also the Python module name
PYTHON_MODULE_NAME="${PROJECT_NAME}"
echo "Removing project '${PROJECT_NAME}' (module: ${PYTHON_MODULE_NAME})..."
# Find running Dagster webserver pod
DAGSTER_POD=$(kubectl get pods -n ${DAGSTER_NAMESPACE} -l component=dagster-webserver -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
if [ -z "$DAGSTER_POD" ] || ! kubectl get pod "$DAGSTER_POD" -n ${DAGSTER_NAMESPACE} &>/dev/null; then
echo "Error: No running Dagster webserver pod found"
echo "Please ensure Dagster is installed and running first"
exit 1
fi
# Remove project files from PVC
echo "Removing project files from shared PVC..."
kubectl exec "$DAGSTER_POD" -n ${DAGSTER_NAMESPACE} -- rm -rf "/opt/dagster/user-code/${PROJECT_NAME}" 2>/dev/null || true
# Remove from workspace.yaml
echo "Removing module '${PYTHON_MODULE_NAME}' from workspace..."
# Get current workspace.yaml from ConfigMap
CURRENT_WORKSPACE=$(kubectl get configmap dagster-workspace-yaml -n ${DAGSTER_NAMESPACE} -o jsonpath='{.data.workspace\.yaml}')
# Check if module exists
if ! echo "$CURRENT_WORKSPACE" | grep -q "module_name: ${PYTHON_MODULE_NAME}"; then
echo "Module '${PYTHON_MODULE_NAME}' not found in workspace - only removing files"
else
# Remove the module entry using sed (remove the python_module block)
NEW_WORKSPACE=$(echo "$CURRENT_WORKSPACE" | sed "/- python_module:/,/working_directory: .*/{/module_name: ${PYTHON_MODULE_NAME}/,/working_directory: .*/d;}")
# If no modules left, reset to empty array
if ! echo "$NEW_WORKSPACE" | grep -q "module_name:"; then
NEW_WORKSPACE="load_from: []"$'\n'
fi
# Update ConfigMap using jq
PATCH_JSON=$(jq -n --arg workspace "$NEW_WORKSPACE" '{"data": {"workspace.yaml": $workspace}}')
kubectl patch configmap dagster-workspace-yaml -n ${DAGSTER_NAMESPACE} --patch "$PATCH_JSON"
echo "✓ Module '${PYTHON_MODULE_NAME}' removed from workspace"
fi
# Restart Dagster to reload workspace
echo "Restarting Dagster to reload workspace..."
kubectl rollout restart deployment/dagster-dagster-webserver -n ${DAGSTER_NAMESPACE}
kubectl rollout restart deployment/dagster-daemon -n ${DAGSTER_NAMESPACE}
echo "✓ Project '${PROJECT_NAME}' removed successfully"
# Setup OAuth2 Proxy for Dagster authentication
setup-oauth2-proxy:
#!/bin/bash
set -euo pipefail
export DAGSTER_HOST=${DAGSTER_HOST:-}
while [ -z "${DAGSTER_HOST}" ]; do
DAGSTER_HOST=$(
gum input --prompt="Dagster host (FQDN): " --width=100 \
--placeholder="e.g., dagster.example.com"
)
done
echo "Setting up OAuth2 Proxy for Dagster..."
just oauth2-proxy::setup-for-app dagster "${DAGSTER_HOST}" "${DAGSTER_NAMESPACE}" "dagster-dagster-webserver:80"
echo "OAuth2 Proxy setup completed"
# Install OAuth2 Proxy for Dagster authentication
install-oauth2-proxy:
just setup-oauth2-proxy
# Remove OAuth2 Proxy
remove-oauth2-proxy:
just oauth2-proxy::remove-for-app dagster ${DAGSTER_NAMESPACE}
# Install Dagster (full setup)
install:
#!/bin/bash
set -euo pipefail
export DAGSTER_HOST=${DAGSTER_HOST:-}
while [ -z "${DAGSTER_HOST}" ]; do
DAGSTER_HOST=$(
gum input --prompt="Dagster host (FQDN): " --width=100 \
--placeholder="e.g., dagster.example.com"
)
done
if [ -z "${DAGSTER_STORAGE_TYPE:-}" ]; then
DAGSTER_STORAGE_TYPE=$(gum choose --header="Select storage type:" "local" "minio")
fi
echo "Selected storage type: ${DAGSTER_STORAGE_TYPE}"
echo "Installing Dagster..."
just create-namespace
just setup-database
just create-oauth-client
if [ "${DAGSTER_STORAGE_TYPE}" = "minio" ]; then
if kubectl get namespace ${MINIO_NAMESPACE} &>/dev/null; then
echo "MinIO detected. Setting up MinIO storage..."
just setup-minio-storage
else
echo "Error: MinIO namespace not found. Please install MinIO first."
exit 1
fi
else
echo "Setting up local PVC storage..."
just setup-pvc-storage
fi
just setup-user-code-pvc
just add-helm-repo
gomplate -f dagster-values.gomplate.yaml -o dagster-values.yaml
helm upgrade --install dagster dagster/dagster \
--namespace ${DAGSTER_NAMESPACE} \
--version ${DAGSTER_CHART_VERSION} \
-f dagster-values.yaml \
--wait --timeout=10m
if gum confirm "Set up Keycloak authentication with OAuth2 proxy?"; then
export DAGSTER_HOST="${DAGSTER_HOST}"
just setup-oauth2-proxy
else
echo "Access Dagster at: https://${DAGSTER_HOST}"
echo "Post-installation notes:"
echo " • Run 'just setup-oauth2-proxy' later to enable Keycloak authentication"
fi
# Uninstall Dagster (complete removal)
uninstall delete-db='true':
#!/bin/bash
set -euo pipefail
echo "Uninstalling Dagster..."
just remove-oauth2-proxy
helm uninstall dagster -n ${DAGSTER_NAMESPACE} --ignore-not-found
just delete-oauth-secret
just delete-database-secret
just delete-minio-secret
just delete-pvc-storage
just delete-namespace
if [ "{{ delete-db }}" = "true" ]; then
just postgres::delete-db dagster
fi
# Clean up Keycloak client
just keycloak::delete-client ${KEYCLOAK_REALM} dagster || true
echo "Dagster uninstalled"
# Clean up database and secrets
cleanup:
#!/bin/bash
set -euo pipefail
echo "This will delete the Dagster database and all secrets."
if gum confirm "Are you sure you want to proceed?"; then
echo "Cleaning up Dagster resources..."
just postgres::delete-db dagster || true
just vault::delete dagster/database || true
just vault::delete dagster/oauth || true
just vault::delete dagster/minio || true
just keycloak::delete-client ${KEYCLOAK_REALM} dagster || true
echo "Cleanup completed"
else
echo "Cleanup cancelled"
fi


@@ -10,6 +10,7 @@ mod airbyte
mod airflow
mod ch-ui
mod clickhouse
mod dagster
mod datahub
mod env
mod external-secrets


@@ -7,4 +7,5 @@ k3sup = "0.13.10"
kubelogin = "1.34.0" kubelogin = "1.34.0"
node = "22.18.0" node = "22.18.0"
python = "3.12.11" python = "3.12.11"
uv = "0.8.7"
vault = "1.20.2" vault = "1.20.2"