Working with Files
Advanced file operations and integration patterns for data science workflows.
Notebook Tip: When working in Jupyter or Marimo notebooks, displaying a dataset object (e.g., evaluating `dataset` in a cell) shows an interactive HTML view. Click "Copy Code to Access" on any file to get a code snippet that automatically uses your variable name.
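The copied snippet looks roughly like the sketch below; the variable name `dataset` and the file name `data.csv` are placeholders, since the UI substitutes your own names:

```python
# Illustrative only: the actual snippet is generated by the notebook UI
# and substitutes your own variable and file names
with dataset.local_files() as local_files:
    local_path = local_files["data.csv"]
```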
File Access Patterns
Local Files Context Manager (Recommended)
The local_files() context manager is the recommended way to work with files:
```python
from pathlib import Path
import json

import pandas as pd
import polars as pl

# Work with files locally
with dataset.local_files() as local_files:
    for filename, local_path in local_files.items():
        print(f"{filename} -> {local_path}")

        # Process files with standard Python libraries
        if filename.endswith('.csv'):
            # Use pandas
            df = pd.read_csv(local_path)
            print(f"CSV shape: {df.shape}")

            # Or use polars
            df_polars = pl.read_csv(local_path)
            print(f"Polars shape: {df_polars.shape}")
        elif filename.endswith('.parquet'):
            # Read parquet files
            df = pd.read_parquet(local_path)
            print(f"Parquet shape: {df.shape}")
        elif filename.endswith('.json'):
            data = json.loads(Path(local_path).read_text())
            print(f"JSON keys: {list(data.keys())}")
```
Benefits:
- Library compatibility: Works with pandas, Polars, NumPy, and any other library that reads from a file path
- Automatic cleanup: Temporary files are removed when the context exits
- Standard paths: Use normal Python file operations on the returned paths
- Memory efficient: Files can be read lazily or in chunks instead of being loaded into memory up front (see the chunked-read sketch below)
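As an example of the last point, here is a minimal sketch of chunked reading with pandas, assuming a large hypothetical `events.csv` in the dataset; only one chunk is held in memory at a time:

```python
import pandas as pd

with dataset.local_files() as local_files:
    if "events.csv" in local_files:  # hypothetical large file
        total_rows = 0
        # chunksize yields DataFrames of up to 100,000 rows each
        for chunk in pd.read_csv(local_files["events.csv"], chunksize=100_000):
            total_rows += len(chunk)
        print(f"Total rows: {total_rows}")
```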
Working with Data Science Libraries
Pandas Examples
```python
import pandas as pd

with dataset.local_files() as local_files:
    # Read CSV files
    if "data.csv" in local_files:
        df = pd.read_csv(local_files["data.csv"])
        print(f"DataFrame shape: {df.shape}")

        # Drop rows with missing values
        df_processed = df.dropna().reset_index(drop=True)

        # Save processed data (index=False avoids writing an extra index column)
        df_processed.to_csv("processed_data.csv", index=False)

        # Commit processed data back to the dataset
        dataset.commit(
            message="Add processed data",
            add_files=["processed_data.csv"]
        )
```
Polars Examples
```python
import polars as pl

with dataset.local_files() as local_files:
    # Read CSV files with Polars
    if "data.csv" in local_files:
        df = pl.read_csv(local_files["data.csv"])
        print(f"Polars DataFrame shape: {df.shape}")

        # Drop rows where "value" is null
        df_processed = df.filter(pl.col("value").is_not_null())

        # Save processed data
        df_processed.write_csv("processed_data.csv")

        # Commit processed data back to the dataset
        dataset.commit(
            message="Add processed data",
            add_files=["processed_data.csv"]
        )
```
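For larger files, Polars can also scan the CSV lazily, so the filter is applied while reading rather than after loading everything. A sketch under the same assumptions as above:

```python
import polars as pl

with dataset.local_files() as local_files:
    if "data.csv" in local_files:
        # scan_csv builds a lazy query; collect() materializes only
        # the rows that survive the filter
        df_processed = (
            pl.scan_csv(local_files["data.csv"])
            .filter(pl.col("value").is_not_null())
            .collect()
        )
        df_processed.write_csv("processed_data.csv")
```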
NumPy Examples
```python
import numpy as np
import pandas as pd

with dataset.local_files() as local_files:
    # Read CSV files as NumPy arrays (via pandas)
    if "data.csv" in local_files:
        df = pd.read_csv(local_files["data.csv"])
        array = df.to_numpy()
        print(f"NumPy array shape: {array.shape}")

        # Replace NaNs with zeros
        processed_array = np.nan_to_num(array)

        # Save processed array
        np.savetxt("processed_data.csv", processed_array, delimiter=",")

        # Commit processed data back to the dataset
        dataset.commit(
            message="Add processed array",
            add_files=["processed_data.csv"]
        )
```
Scikit-learn Examples
```python
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

with dataset.local_files() as local_files:
    # Load training data
    if "train.csv" in local_files:
        df = pd.read_csv(local_files["train.csv"])
        X = df.drop("target", axis=1)
        y = df["target"]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Train model
        model = RandomForestClassifier(n_estimators=100)
        model.fit(X_train, y_train)

        # Save model
        joblib.dump(model, "model.pkl")

        # Save test data
        X_test.to_csv("X_test.csv", index=False)
        y_test.to_csv("y_test.csv", index=False)

        # Commit model and test data
        dataset.commit(
            message="Add trained model and test data",
            add_files=["model.pkl", "X_test.csv", "y_test.csv"]
        )
```
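In a later session you can pull those artifacts back out of the dataset. A minimal sketch, assuming the commit above succeeded and that `local_files()` serves the files from the latest commit:

```python
import joblib
import pandas as pd

with dataset.local_files() as local_files:
    # Reload the committed model and held-out test split
    model = joblib.load(local_files["model.pkl"])
    X_test = pd.read_csv(local_files["X_test.csv"])
    y_test = pd.read_csv(local_files["y_test.csv"])["target"]
    print(f"Test accuracy: {model.score(X_test, y_test):.3f}")
```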
Best Practices
File Naming
Use descriptive, consistent file names:
```python
# Good naming
dataset.commit("Add Q1 sales data", add_files=["sales_q1_2024.csv"])
dataset.commit("Add processed data", add_files=["sales_q1_2024_processed.csv"])
dataset.commit("Add model results", add_files=["model_results_q1_2024.json"])

# Avoid vague names
dataset.commit("Add data", add_files=["data.csv"])
dataset.commit("Update", add_files=["file1.csv", "file2.csv"])
```
File Organization
Organize files in logical directories:
```python
# Good organization
dataset.commit("Add raw data", add_files=["raw/sales.csv", "raw/customers.csv"])
dataset.commit("Add processed data", add_files=["processed/sales_clean.csv"])
dataset.commit("Add analysis", add_files=["analysis/results.csv", "analysis/plots.png"])

# Avoid flat structure
dataset.commit("Add files", add_files=["sales.csv", "customers.csv", "results.csv", "plots.png"])
```
Next Steps
- Commit Management - Understanding commit history
- Cloud Storage - Working with cloud backends
- Basic Usage - Core dataset operations