# API Reference
Documentation for internal data APIs.
## Data Pipeline API
### PipelineRunner
class PipelineRunner:
    """Main class for running data pipelines.

    Attributes:
        config (PipelineConfig): Pipeline configuration
        logger (Logger): Pipeline logger
    """

    def __init__(self, config: PipelineConfig):
        """Initialize the pipeline runner.

        Args:
            config: Pipeline configuration object
        """
        self.config = config
        self.logger = self._setup_logger()

    def run(self, dry_run: bool = False) -> PipelineResult:
        """Execute the pipeline.

        Args:
            dry_run: If True, validate without executing

        Returns:
            PipelineResult with execution details

        Raises:
            PipelineError: If pipeline execution fails
        """
        # Signature-only stub: the implementation is not shown in this reference.
        pass

    def validate(self) -> ValidationResult:
        """Validate pipeline configuration and dependencies.

        Returns:
            ValidationResult with any issues found
        """
        # Signature-only stub: the implementation is not shown in this reference.
        pass
### DataValidator Methods
| Method | Description | Returns |
|---|---|---|
| | Check for null values | |
| | Check for duplicate keys | |
| | Validate against schema | |
| | Check value ranges | |
### Usage Example
from pipeline import PipelineRunner, PipelineConfig

# Configure and run
config = PipelineConfig(
    name="daily_sales",
    source="postgresql://...",
    target="s3://data-lake/sales/",
)
runner = PipelineRunner(config)

# Validate first
validation = runner.validate()
if validation.is_valid:
    # Only run (and report) when validation found no blocking issues;
    # `result` is bound inside this branch, so the print must live here too.
    result = runner.run()
    print(f"Pipeline completed: {result.rows_processed} rows")