Skip to content

Working with Files

Advanced file operations and integration patterns for data science workflows.

Notebook Tip: When working in Jupyter or Marimo notebooks, evaluating a dataset object in a cell (e.g., just typing `dataset`) shows an interactive HTML view. Click "Copy Code to Access" on any file to get code snippets that automatically use your variable name!

File Access Patterns

The local_files() context manager is the recommended way to work with files:

from pathlib import Path
import pandas as pd
import polars as pl

# Work with files locally: local_files maps each dataset filename to a
# temporary local path, cleaned up automatically when the context exits.
with dataset.local_files() as local_files:
    for filename, local_path in local_files.items():
        # FIX: print the actual filename (was a garbled "(unknown)" placeholder)
        print(f"{filename} -> {local_path}")

        # Dispatch on file extension and process with standard libraries
        if filename.endswith('.csv'):
            # Use pandas
            df = pd.read_csv(local_path)
            print(f"CSV shape: {df.shape}")

            # Or use polars
            df_polars = pl.read_csv(local_path)
            print(f"Polars shape: {df_polars.shape}")

        elif filename.endswith('.parquet'):
            # Read parquet files
            df = pd.read_parquet(local_path)
            print(f"Parquet shape: {df.shape}")

        elif filename.endswith('.json'):
            import json
            data = json.loads(Path(local_path).read_text())
            print(f"JSON keys: {list(data.keys())}")

Benefits:

  • Library compatibility: Works with pandas, polars, numpy, etc.
  • Automatic cleanup: Local copies are removed automatically when the context manager exits
  • Standard paths: Use normal Python file operations
  • Memory efficient: No need to load entire files into memory

Working with Data Science Libraries

Pandas Examples

import pandas as pd

with dataset.local_files() as local_files:
    # Read CSV files
    if "data.csv" in local_files:
        df = pd.read_csv(local_files["data.csv"])
        print(f"DataFrame shape: {df.shape}")

        # Process data: drop rows with missing values, renumber rows
        df_processed = df.dropna().reset_index(drop=True)

        # Save processed data. index=False avoids writing the row index,
        # which would otherwise appear as a spurious "Unnamed: 0" column
        # when the file is read back.
        df_processed.to_csv("processed_data.csv", index=False)

        # Commit processed data back to the dataset
        dataset.commit(
            message="Add processed data",
            add_files=["processed_data.csv"]
        )

Polars Examples

import polars as pl

with dataset.local_files() as local_files:
    # Work on data.csv with Polars, if the dataset contains it
    if "data.csv" in local_files:
        frame = pl.read_csv(local_files["data.csv"])
        print(f"Polars DataFrame shape: {frame.shape}")

        # Keep only the rows whose "value" column is non-null
        cleaned = frame.filter(pl.col("value").is_not_null())

        # Write the cleaned frame out as CSV
        cleaned.write_csv("processed_data.csv")

        # Record the new file in the dataset's history
        dataset.commit(
            message="Add processed data",
            add_files=["processed_data.csv"]
        )

NumPy Examples

import numpy as np
import pandas as pd  # FIX: pd.read_csv is used below but was never imported

with dataset.local_files() as local_files:
    # Read CSV files as NumPy arrays
    if "data.csv" in local_files:
        # Load via pandas, then take the underlying array
        # (to_numpy() is the modern replacement for .values)
        df = pd.read_csv(local_files["data.csv"])
        array = df.to_numpy()
        print(f"NumPy array shape: {array.shape}")

        # Replace NaN (and +/-inf) with finite numbers
        processed_array = np.nan_to_num(array)

        # Save processed array as CSV
        np.savetxt("processed_data.csv", processed_array, delimiter=",")

        # Commit processed data
        dataset.commit(
            message="Add processed array",
            add_files=["processed_data.csv"]
        )

Scikit-learn Examples

import pandas as pd  # FIX: pd.read_csv is used below but was never imported
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

with dataset.local_files() as local_files:
    # Load training data
    if "train.csv" in local_files:
        df = pd.read_csv(local_files["train.csv"])
        X = df.drop("target", axis=1)
        y = df["target"]

        # Split data; a fixed random_state makes the example reproducible
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train model (random_state again for reproducible forests)
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Save model
        joblib.dump(model, "model.pkl")

        # Save held-out test data alongside the model
        X_test.to_csv("X_test.csv", index=False)
        y_test.to_csv("y_test.csv", index=False)

        # Commit model and test data
        dataset.commit(
            message="Add trained model and test data",
            add_files=["model.pkl", "X_test.csv", "y_test.csv"]
        )

Best Practices

File Naming

Use descriptive, consistent file names:

# Good naming: encode the content and the time period in the filename
dataset.commit("Add Q1 sales data", add_files=["sales_q1_2024.csv"])
dataset.commit("Add processed data", add_files=["sales_q1_2024_processed.csv"])
dataset.commit("Add model results", add_files=["model_results_q1_2024.json"])

# Avoid vague names: "data.csv" or "file1.csv" say nothing about the contents
dataset.commit("Add data", add_files=["data.csv"])
dataset.commit("Update", add_files=["file1.csv", "file2.csv"])

File Organization

Organize files in logical directories:

# Good organization: group files by pipeline stage (raw -> processed -> analysis)
dataset.commit("Add raw data", add_files=["raw/sales.csv", "raw/customers.csv"])
dataset.commit("Add processed data", add_files=["processed/sales_clean.csv"])
dataset.commit("Add analysis", add_files=["analysis/results.csv", "analysis/plots.png"])

# Avoid a flat structure: mixing stages in one directory obscures data provenance
dataset.commit("Add files", add_files=["sales.csv", "customers.csv",
                                        "results.csv", "plots.png"])

Next Steps