Commit Management
Understanding and working with Kirin's linear commit history.
Understanding Commits
What is a Commit?
A commit in Kirin represents an immutable snapshot of files at a point in time. Unlike Git, Kirin uses a linear commit history - each commit has at most one parent commit (the initial commit has none), creating a simple chain of changes.
Commit A → Commit B → Commit C → Commit D
Commit Properties
Every commit has these key properties:
- Hash: Unique identifier (SHA256)
- Message: Human-readable description
- Timestamp: When the commit was created
- Parent: Reference to previous commit (linear history)
- Files: Dictionary of files in this commit
# Get commit information
commit = dataset.get_commit(commit_hash)
# Guard against a falsy result before reading any properties.
if commit:
    print(f"Commit hash: {commit.hash}")
    print(f"Short hash: {commit.short_hash}")
    print(f"Message: {commit.message}")
    print(f"Timestamp: {commit.timestamp}")
    print(f"Parent: {commit.parent_hash}")
    print(f"Files: {commit.list_files()}")
    print(f"Total size: {commit.get_total_size()} bytes")
Creating Commits
Commit Method Parameters
The commit() method accepts the following parameters:
- message (required): Human-readable description of the changes
- add_files (optional): List of file paths to add or update
- remove_files (optional): List of filenames to remove from the dataset
# Add new files
dataset.commit(message="Add new data", add_files=["data.csv", "metadata.json"])
# Remove files
dataset.commit(message="Remove old files", remove_files=["old_data.csv"])
# Add and remove files in the same commit
dataset.commit(
message="Update dataset",
add_files=["new_data.csv"],
remove_files=["old_data.csv"]
)
Error Handling
The commit() method can raise the following exceptions:
- ValueError: If no changes are specified (both add_files and remove_files are empty)
- FileNotFoundError: If a file in add_files doesn't exist
try:
    # This will raise ValueError
    dataset.commit(message="No changes")
except ValueError as e:
    print(f"Error: {e}")
try:
    # This will raise FileNotFoundError if file doesn't exist
    dataset.commit(message="Add file", add_files=["nonexistent.csv"])
except FileNotFoundError as e:
    print(f"File not found: {e}")
Working with Commit History
Viewing History
# Get all commits
history = dataset.history()
for commit in history:
    print(f"{commit.short_hash}: {commit.message}")
    print(f" Date: {commit.timestamp}")
    print(f" Files: {commit.list_files()}")
    print()
# Get limited history
recent_commits = dataset.history(limit=5)
for commit in recent_commits:
    print(f"{commit.short_hash}: {commit.message}")
Navigating History
# Checkout latest commit (default)
dataset.checkout()
# Checkout specific commit
dataset.checkout(commit_hash)
# Get current commit
current_commit = dataset.current_commit
# Only report details when a current commit is set.
if current_commit:
    print(f"Current commit: {current_commit.short_hash}")
    print(f"Message: {current_commit.message}")
Comparing Commits
def compare_commits(dataset, commit1_hash, commit2_hash):
    """Compare two commits and print what changed between them.

    Prints the files added in the second commit, the files removed from it,
    the files present in both, and which shared files have a different hash.
    Prints a notice and returns early if either commit cannot be looked up.

    Args:
        dataset: Object exposing ``get_commit(hash)``.
        commit1_hash: Hash of the baseline commit.
        commit2_hash: Hash of the commit compared against the baseline.
    """
    commit1 = dataset.get_commit(commit1_hash)
    commit2 = dataset.get_commit(commit2_hash)
    if not commit1 or not commit2:
        print("One or both commits not found")
        return
    files1 = set(commit1.list_files())
    files2 = set(commit2.list_files())
    # Sort each result so the report is stable across runs: raw sets print
    # in arbitrary order and an empty one renders as "set()".
    added = sorted(files2 - files1)
    removed = sorted(files1 - files2)
    common = sorted(files1 & files2)
    print(f"Added files: {added}")
    print(f"Removed files: {removed}")
    print(f"Common files: {common}")
    # A shared filename counts as changed when the two file hashes differ.
    for filename in common:
        if commit1.get_file(filename).hash != commit2.get_file(filename).hash:
            print(f"Changed: {filename}")
# Use the comparison function
compare_commits(dataset, "abc123", "def456")
Commit Workflows
Linear Development
Kirin's linear history is perfect for data science workflows:
# Initial data
dataset.commit(message="Add raw data", add_files=["raw_data.csv"])
# Data cleaning
dataset.commit(message="Clean data", add_files=["cleaned_data.csv"])
# Feature engineering
dataset.commit(message="Add features", add_files=["features.csv"])
# Model training
dataset.commit(message="Add trained model", add_files=["model.pkl"])
# Results
dataset.commit(message="Add results", add_files=["results.csv", "plots.png"])
Experiment Tracking
Track different experiments as separate commits:
# Experiment 1: Random Forest
dataset.commit(message="RF experiment", add_files=["rf_model.pkl", "rf_results.csv"])
# Experiment 2: Gradient Boosting
dataset.commit(message="GB experiment", add_files=["gb_model.pkl", "gb_results.csv"])
# Experiment 3: Neural Network
dataset.commit(message="NN experiment", add_files=["nn_model.pkl", "nn_results.csv"])
Data Pipeline Versioning
Version your data processing pipeline outputs:
# Raw data ingestion
dataset.commit(message="Ingest raw data", add_files=["raw/sales.csv", "raw/customers.csv"])
# Data validation
dataset.commit(message="Validate data", add_files=["validated/sales.csv", "validated/customers.csv"])
# Data transformation
dataset.commit(message="Transform data", add_files=["transformed/sales_clean.csv"])
# Feature engineering
dataset.commit(message="Create features", add_files=["features/engineered_features.csv"])
# Final output
dataset.commit(message="Final dataset", add_files=["final/dataset.csv"])
Advanced Commit Operations
Commit Information
def analyze_commit(commit):
    """Print a detailed report for a single commit.

    Reports the short hash, message, timestamp, parent hash, number of files
    and total size, followed by one line per file.

    Args:
        commit: Commit object exposing ``short_hash``, ``message``,
            ``timestamp``, ``parent_hash``, a ``files`` mapping of
            filename to file object, and ``get_total_size()``.
    """
    print(f"Commit: {commit.short_hash}")
    print(f"Message: {commit.message}")
    print(f"Date: {commit.timestamp}")
    print(f"Parent: {commit.parent_hash}")
    print(f"Files: {len(commit.files)}")
    print(f"Total size: {commit.get_total_size()} bytes")
    # File details
    for filename, file_obj in commit.files.items():
        print(f" {filename}: {file_obj.size} bytes ({file_obj.short_hash})")
# Analyze current commit
current_commit = dataset.current_commit
# Skip the report when no current commit is set.
if current_commit:
    analyze_commit(current_commit)
Commit Statistics
def commit_statistics(dataset):
    """Print summary statistics about a dataset's commit history.

    Reports the number of commits, the summed and average commit size in MB,
    and the five filenames that appear in the most commits. Prints a notice
    and returns early when the history is empty.

    Args:
        dataset: Object exposing ``history()``, which returns a sized
            collection of commits.
    """
    from collections import Counter

    history = dataset.history()
    # The early return also protects the average below from dividing by zero.
    if not history:
        print("No commits found")
        return
    total_commits = len(history)
    total_size = sum(commit.get_total_size() for commit in history)
    avg_size = total_size / total_commits
    print(f"Total commits: {total_commits}")
    print(f"Total size: {total_size / (1024*1024):.1f} MB")
    print(f"Average commit size: {avg_size / (1024*1024):.1f} MB")
    # File frequency: count how many commits list each filename.
    file_counts = Counter(
        filename for commit in history for filename in commit.list_files()
    )
    print("Most frequently changed files:")
    for filename, count in file_counts.most_common(5):
        print(f" {filename}: {count} commits")
# Get statistics
commit_statistics(dataset)
Commit Best Practices
Commit Messages
Write clear, descriptive commit messages:
# Good commit messages
dataset.commit(message="Add Q1 2024 sales data", add_files=["sales_q1_2024.csv"])
dataset.commit(message="Fix data quality issues in customer records", add_files=["customers_cleaned.csv"])
dataset.commit(message="Add feature engineering for ML model", add_files=["features.csv"])
# Avoid vague messages
dataset.commit(message="Update", add_files=["data.csv"])
dataset.commit(message="Fix", add_files=["file.csv"])
dataset.commit(message="Add stuff", add_files=["file1.csv", "file2.csv"])
Commit Frequency
Commit changes regularly:
# Commit after each logical step
dataset.commit(message="Add raw data", add_files=["raw_data.csv"])
# ... process data ...
dataset.commit(message="Add cleaned data", add_files=["cleaned_data.csv"])
# ... analyze data ...
dataset.commit(message="Add analysis results", add_files=["results.csv"])
Atomic Commits
Make commits atomic (single logical change):
# Good: Single logical change
dataset.commit(message="Add customer data", add_files=["customers.csv"])
# Good: Related changes together
dataset.commit(message="Update customer data and add validation",
add_files=["customers_updated.csv", "validation_rules.json"])
# Avoid: Unrelated changes
dataset.commit(message="Add customer data and fix bug",
add_files=["customers.csv", "bug_fix.py"])
Working with Specific Commits
Accessing Files from Specific Commits
def get_files_from_commit(dataset, commit_hash):
    """Check out a commit, list its files, and return them.

    Note that this calls ``dataset.checkout(commit_hash)`` and does not
    switch back afterwards, so the dataset stays on that commit.

    Args:
        dataset: Object exposing ``get_commit()``, ``checkout()`` and a
            ``files`` mapping of filename to file object.
        commit_hash: Hash of the commit whose files are wanted.

    Returns:
        The dataset's ``files`` mapping after the checkout, or ``None`` when
        the commit cannot be looked up.
    """
    commit = dataset.get_commit(commit_hash)
    if not commit:
        print("Commit not found")
        return None
    # Checkout the commit
    dataset.checkout(commit_hash)
    # Access files
    files = dataset.files
    print(f"Files in commit {commit_hash}:")
    for filename, file_obj in files.items():
        print(f" {filename}: {file_obj.size} bytes")
    return files
# Get files from specific commit
files = get_files_from_commit(dataset, "abc123")
Commit History Visualization
Commit Timeline
def create_timeline(dataset):
    """Print one line per commit: timestamp, short hash and message.

    Commits are printed in the order ``dataset.history()`` yields them.

    Args:
        dataset: Object exposing ``history()``; each commit's ``timestamp``
            must support ``strftime``.
    """
    history = dataset.history()
    print("Commit Timeline:")
    print("=" * 30)
    for commit in history:
        date_str = commit.timestamp.strftime("%Y-%m-%d %H:%M")
        print(f"{date_str} | {commit.short_hash} | {commit.message}")
# Create timeline
create_timeline(dataset)
Troubleshooting Commits
Finding Lost Commits
def find_commit_by_message(dataset, search_term):
    """Return the first commit whose message contains ``search_term``.

    Matching is case-insensitive. Commits are scanned in the order
    ``dataset.history()`` yields them and the scan stops at the first hit.

    Args:
        dataset: Object exposing ``history()``.
        search_term: Text to look for inside commit messages.

    Returns:
        The first matching commit, or ``None`` when nothing matches.
    """
    # Lower-case the needle once instead of on every iteration.
    needle = search_term.lower()
    for commit in dataset.history():
        if needle in commit.message.lower():
            print(f"Found: {commit.short_hash} - {commit.message}")
            return commit
    print(f"No commits found matching '{search_term}'")
    return None
# Find commit
commit = find_commit_by_message(dataset, "model")
Recovering from Mistakes
def recover_from_mistake(dataset, good_commit_hash):
    """Check out a known-good commit and verify the checkout took effect.

    Args:
        dataset: Object exposing ``checkout()`` and ``current_commit``.
        good_commit_hash: Hash of the commit to return to.

    Returns:
        ``True`` when the dataset ends up on the requested commit,
        ``False`` otherwise.
    """
    # Checkout the good commit
    dataset.checkout(good_commit_hash)
    # Verify we're on the right commit. A prefix match is used so that an
    # abbreviated hash (such as "abc123") is not reported as a failure; a
    # full hash still matches exactly.
    # NOTE(review): assumes checkout() accepts abbreviated hashes - confirm.
    current_commit = dataset.current_commit
    recovered = bool(
        current_commit
        and good_commit_hash
        and current_commit.hash.startswith(good_commit_hash)
    )
    if recovered:
        print(f"Successfully recovered to commit {good_commit_hash}")
        print(f"Current commit: {current_commit.message}")
    else:
        print("Failed to recover to specified commit")
    return recovered
# Recover from mistake
recover_from_mistake(dataset, "abc123")
Next Steps
- Working with Files - File operations and patterns
- Basic Usage - Core dataset operations
- Cloud Storage - Working with cloud backends