fix(dagster): correct error in example summary asset

This commit is contained in:
Masaki Yatsu
2025-09-16 09:31:04 +09:00
parent 42d74ff961
commit 686cea2022
2 changed files with 46 additions and 21 deletions

View File

@@ -104,25 +104,46 @@ def movielens_summary(
) -> MaterializeResult:
"""Generate summary of all loaded MovieLens data."""
# Get pipeline to access dlt info
pipeline = dlt.create_pipeline()
context.log.info("Generating summary of MovieLens dataset...")
# Get schema info
schema = pipeline.default_schema
tables = list(schema.tables.keys())
# Try to get schema from one of the existing pipelines
pipeline_names = ["movies", "ratings", "tags"]
schema_info = {}
tables_found = []
context.log.info(f"MovieLens dataset loaded with tables: {tables}")
for table_name in pipeline_names:
try:
# Create pipeline with the same name used in previous assets
pipeline = dlt.create_pipeline(table_name=table_name)
# Calculate basic metrics
table_count = len([t for t in tables if t in ["movies", "ratings", "tags"]])
# Try to get schema if it exists
if pipeline.default_schema_name in pipeline.schemas:
schema = pipeline.schemas[pipeline.default_schema_name]
context.log.info(
f"Found schema for pipeline '{pipeline.pipeline_name}'"
)
schema_info[table_name] = {
"pipeline": pipeline.pipeline_name,
"schema_version": schema.version,
}
tables_found.extend(
[t for t in schema.tables.keys() if t == table_name]
)
except Exception as e:
context.log.debug(f"Could not get schema for {table_name}: {e}")
context.log.info(
f"Summary: Found {len(tables_found)} tables from {len(schema_info)} pipelines"
)
return MaterializeResult(
metadata={
"pipeline_name": MetadataValue.text(dlt.pipeline_name),
"base_pipeline_name": MetadataValue.text(dlt.pipeline_name),
"dataset_name": MetadataValue.text(dlt.dataset_name),
"destination": MetadataValue.text(dlt.destination),
"schema_version": MetadataValue.int(schema.version if schema else 0),
"tables": MetadataValue.json(tables),
"movielens_tables": MetadataValue.int(table_count),
"pipelines_checked": MetadataValue.json(list(schema_info.keys())),
"tables_found": MetadataValue.json(tables_found),
"movielens_tables_count": MetadataValue.int(len(tables_found)),
"schema_info": MetadataValue.json(schema_info),
}
)

View File

@@ -28,17 +28,18 @@ class DltResource(ConfigurableResource):
# Enable detailed logging for dlt
os.environ["DLT_LOG_LEVEL"] = "INFO"
def create_pipeline(self):
"""Create dlt pipeline."""
import uuid
def create_pipeline(self, table_name: str):
"""Create dlt pipeline with optional table-specific name."""
self.setup_environment()
# Use a unique pipeline name to avoid conflicts
unique_pipeline_name = f"{self.pipeline_name}_{uuid.uuid4().hex[:8]}"
# Use table-specific pipeline name if provided, otherwise use base name
if table_name:
pipeline_name = f"{self.pipeline_name}_{table_name}"
else:
pipeline_name = self.pipeline_name
return dlt.pipeline(
pipeline_name=unique_pipeline_name,
pipeline_name=pipeline_name,
destination=self.destination,
dataset_name=self.dataset_name,
)
@@ -70,9 +71,12 @@ class DltResource(ConfigurableResource):
"""Run dlt pipeline with given resource data."""
logger = get_dagster_logger()
pipeline = self.create_pipeline()
# Create pipeline with table-specific name
pipeline = self.create_pipeline(table_name=table_name)
logger.info(f"Running pipeline for table {table_name}")
logger.info(
f"Running pipeline '{pipeline.pipeline_name}' for table {table_name}"
)
# Configure pipeline for progress tracking
pipeline.config.progress = "log" # Enables progress logging