Data Analysis with Python

Published

February 2026

This page covers the core Python skills for data analysis in PHYS 4430: working with arrays, making plots, fitting data to models, spectral analysis, and propagating uncertainties.

1 Python Basics Refresher

If you’re new to Python or need a refresher, here are the essentials.

1.1 Variables and Data Types

# Numbers
voltage = 3.14159          # float (decimal)
num_samples = 100          # int (integer)

# Strings
filename = "data.csv"

# Lists (ordered, changeable)
measurements = [1.2, 1.3, 1.4, 1.5]

# Booleans
is_running = True

1.2 Functions

def calculate_energy(wavelength_nm):
    """
    Calculate photon energy from wavelength.

    Parameters:
        wavelength_nm: Wavelength in nanometers

    Returns:
        Energy in Joules
    """
    h = 6.626e-34  # Planck's constant (J·s)
    c = 3e8        # Speed of light (m/s)
    wavelength_m = wavelength_nm * 1e-9
    return (h * c) / wavelength_m

# Call the function
energy = calculate_energy(632.8)
print(f"Energy: {energy:.3e} J")

1.3 Loops

# For loop - iterate over a sequence
wavelengths = [400, 500, 600, 700]
for wl in wavelengths:
    print(f"Wavelength: {wl} nm")

# While loop - repeat until condition is false
count = 0
while count < 5:
    print(f"Count: {count}")
    count += 1

1.4 Importing Packages

# Import entire package
import numpy

# Import with alias (common convention)
import numpy as np
import matplotlib.pyplot as plt

# Import specific functions
from scipy.optimize import curve_fit

2 NumPy Essentials

NumPy is the foundation for numerical computing in Python. It provides fast array operations essential for scientific data.

2.1 Creating Arrays

import numpy as np

# From a list
data = np.array([1.2, 1.3, 1.4, 1.5, 1.6])

# Evenly spaced values
x = np.linspace(0, 10, 100)    # 100 points from 0 to 10
t = np.arange(0, 1, 0.01)      # 0 to 1 in steps of 0.01

# Arrays of zeros or ones
zeros = np.zeros(100)
ones = np.ones(50)

2.2 Array Operations

import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])

# Element-wise operations
z = x + y           # [3, 6, 9, 12, 15]
z = x * y           # [2, 8, 18, 32, 50]
z = x ** 2          # [1, 4, 9, 16, 25]

# Mathematical functions
z = np.sin(x)
z = np.exp(x)
z = np.sqrt(x)

# Statistics
mean_val = np.mean(x)
std_val = np.std(x)
max_val = np.max(x)

2.3 Loading Data from CSV

import numpy as np

# Simple CSV with just numbers
data = np.loadtxt('data.csv', delimiter=',')

# CSV with header row
data = np.loadtxt('data.csv', delimiter=',', skiprows=1)

# Access columns
x = data[:, 0]  # First column
y = data[:, 1]  # Second column

2.4 Saving Data to CSV

import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([1.1, 2.2, 3.3, 4.4, 5.5])

# Stack columns together
data = np.column_stack((x, y))

# Save with header
np.savetxt('output.csv', data, delimiter=',',
           header='x,y', comments='')

3 Plotting with Matplotlib

Matplotlib is the standard plotting library for Python.

3.1 Basic Line Plot

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(10, 6))
plt.plot(x, y)
plt.xlabel('x')
plt.ylabel('sin(x)')
plt.title('Sine Wave')
plt.grid(True)
plt.show()

3.2 Scatter Plot

import matplotlib.pyplot as plt

x_data = [1, 2, 3, 4, 5]
y_data = [2.1, 3.9, 6.2, 7.8, 10.1]

plt.figure(figsize=(10, 6))
plt.scatter(x_data, y_data, marker='o', color='blue')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot')
plt.grid(True)
plt.show()

3.3 Error Bars

import matplotlib.pyplot as plt
import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([2.1, 3.9, 6.2, 7.8, 10.1])
y_err = np.array([0.2, 0.3, 0.2, 0.4, 0.3])  # Uncertainties

plt.figure(figsize=(10, 6))
plt.errorbar(x, y, yerr=y_err, fmt='o', capsize=5,
             label='Data with uncertainties')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Data with Error Bars')
plt.legend()
plt.grid(True)
plt.show()

3.4 Multiple Plots and Legends

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

plt.figure(figsize=(10, 6))
plt.plot(x, np.sin(x), label='sin(x)')
plt.plot(x, np.cos(x), label='cos(x)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Multiple Functions')
plt.legend()
plt.grid(True)
plt.show()

3.5 Subplots

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

ax1.plot(x, np.sin(x))
ax1.set_ylabel('sin(x)')
ax1.set_title('Sine')
ax1.grid(True)

ax2.plot(x, np.cos(x))
ax2.set_xlabel('x')
ax2.set_ylabel('cos(x)')
ax2.set_title('Cosine')
ax2.grid(True)

plt.tight_layout()
plt.show()

3.6 Saving Figures

import matplotlib.pyplot as plt

# ... create your plot ...

# Save as PNG (high resolution)
plt.savefig('my_plot.png', dpi=300, bbox_inches='tight')

# Save as PDF (vector format, good for publications)
plt.savefig('my_plot.pdf', bbox_inches='tight')

4 Curve Fitting with SciPy

Fitting data to a model is one of the most common tasks in experimental physics.

4.1 Basic Curve Fitting

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Define the model function
def linear(x, m, b):
    """Linear function: y = mx + b"""
    return m * x + b

# Sample data
x_data = np.array([1, 2, 3, 4, 5])
y_data = np.array([2.1, 3.9, 6.2, 7.8, 10.1])

# Perform the fit
popt, pcov = curve_fit(linear, x_data, y_data)

# Extract fit parameters
m_fit, b_fit = popt
print(f"Slope: {m_fit:.3f}")
print(f"Intercept: {b_fit:.3f}")

# Plot data and fit
x_fit = np.linspace(0, 6, 100)
y_fit = linear(x_fit, m_fit, b_fit)

plt.figure(figsize=(10, 6))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_fit, y_fit, 'r-', label='Fit')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()

4.2 Getting Uncertainties from the Fit

The covariance matrix pcov contains information about parameter uncertainties:

import numpy as np
from scipy.optimize import curve_fit

# ... perform fit to get popt, pcov ...

# Parameter uncertainties are square root of diagonal elements
perr = np.sqrt(np.diag(pcov))

m_fit, b_fit = popt
m_err, b_err = perr

print(f"Slope: {m_fit:.3f} +/- {m_err:.3f}")
print(f"Intercept: {b_fit:.3f} +/- {b_err:.3f}")

4.3 Nonlinear Fitting Example: Gaussian

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Gaussian function
def gaussian(x, amplitude, center, width):
    return amplitude * np.exp(-((x - center) ** 2) / (2 * width ** 2))

# Generate sample data (with noise)
x_data = np.linspace(-5, 5, 50)
y_true = gaussian(x_data, 10, 0, 1)
y_data = y_true + np.random.normal(0, 0.5, len(x_data))

# Initial guesses (important for nonlinear fits!)
p0 = [8, 0.5, 1.5]  # [amplitude, center, width]

# Perform fit
popt, pcov = curve_fit(gaussian, x_data, y_data, p0=p0)
perr = np.sqrt(np.diag(pcov))

print(f"Amplitude: {popt[0]:.2f} +/- {perr[0]:.2f}")
print(f"Center: {popt[1]:.2f} +/- {perr[1]:.2f}")
print(f"Width: {popt[2]:.2f} +/- {perr[2]:.2f}")

4.4 Weighted Fitting (Using Uncertainties)

When you have uncertainties on your data points, use weighted fitting:

import numpy as np
from scipy.optimize import curve_fit

x_data = np.array([1, 2, 3, 4, 5])
y_data = np.array([2.1, 3.9, 6.2, 7.8, 10.1])
y_err = np.array([0.2, 0.3, 0.2, 0.4, 0.3])  # Uncertainties

def linear(x, m, b):
    return m * x + b

# Use sigma parameter for weighted fit
# absolute_sigma=True means y_err are actual standard deviations
popt, pcov = curve_fit(linear, x_data, y_data,
                       sigma=y_err, absolute_sigma=True)

4.5 Plotting Residuals

Residuals help you assess the quality of your fit:

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# ... perform fit ...

# Calculate residuals
y_fit = linear(x_data, *popt)
residuals = y_data - y_fit

# Create figure with two subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8),
                               gridspec_kw={'height_ratios': [3, 1]})

# Top plot: data and fit
ax1.scatter(x_data, y_data, label='Data')
ax1.plot(x_fit, linear(x_fit, *popt), 'r-', label='Fit')
ax1.set_ylabel('y')
ax1.legend()
ax1.grid(True)

# Bottom plot: residuals
ax2.scatter(x_data, residuals)
ax2.axhline(y=0, color='r', linestyle='--')
ax2.set_xlabel('x')
ax2.set_ylabel('Residuals')
ax2.grid(True)

plt.tight_layout()
plt.show()

4.6 Calculating Chi-Squared

Chi-squared (\(\chi^2\)) quantifies the goodness of fit:

import numpy as np

def calculate_chi_squared(y_data, y_fit, y_err, num_params):
    """
    Calculate chi-squared and reduced chi-squared.

    Parameters:
        y_data: Measured data points
        y_fit: Fitted values at same x positions
        y_err: Uncertainties on data points
        num_params: Number of fit parameters

    Returns:
        chi2: Chi-squared value
        chi2_red: Reduced chi-squared (chi2 / degrees of freedom)
    """
    residuals = y_data - y_fit
    chi2 = np.sum((residuals / y_err) ** 2)
    dof = len(y_data) - num_params  # degrees of freedom
    chi2_red = chi2 / dof
    return chi2, chi2_red

# Example usage
chi2, chi2_red = calculate_chi_squared(y_data, y_fit, y_err, num_params=2)
print(f"Chi-squared: {chi2:.2f}")
print(f"Reduced chi-squared: {chi2_red:.2f}")
# For a good fit, reduced chi-squared should be close to 1

5 FFT and Spectral Analysis

Fourier analysis lets you identify frequency components in your data, useful for finding noise sources (like 60 Hz interference) and understanding signal characteristics.

5.1 Computing the FFT

import numpy as np

def compute_power_spectrum(signal, sample_rate):
    """
    Compute the one-sided power spectrum of a real-valued signal.

    Uses rfft/rfftfreq, which automatically return only the
    positive-frequency half for real input.

    Parameters:
        signal: 1D array of signal values
        sample_rate: Sampling rate in Hz

    Returns:
        frequencies: Array of positive frequency values (Hz)
        power: Array of power values
    """
    n = len(signal)
    frequencies = np.fft.rfftfreq(n, d=1 / sample_rate)
    fft_result = np.fft.rfft(signal)

    # Power spectrum (magnitude squared, normalized)
    power = (np.abs(fft_result) / n) ** 2

    # Double power for frequencies that appear twice (all except DC and Nyquist)
    power[1:-1] *= 2

    return frequencies, power

5.2 Basic FFT Example

import numpy as np
import matplotlib.pyplot as plt

# Generate a test signal: 50 Hz + 120 Hz + noise
sample_rate = 1000  # Hz
duration = 1.0      # seconds
n_samples = int(sample_rate * duration)

t = np.arange(n_samples) / sample_rate
signal = (np.sin(2 * np.pi * 50 * t) +
          0.5 * np.sin(2 * np.pi * 120 * t) +
          0.2 * np.random.randn(n_samples))

# Compute power spectrum
frequencies, power = compute_power_spectrum(signal, sample_rate)

# Plot time domain and frequency domain
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

ax1.plot(t * 1000, signal, 'b-', linewidth=0.5)
ax1.set_xlabel('Time (ms)')
ax1.set_ylabel('Amplitude')
ax1.set_title('Time Domain')
ax1.set_xlim(0, 100)  # Show first 100 ms
ax1.grid(True)

ax2.plot(frequencies, power, 'r-')
ax2.set_xlabel('Frequency (Hz)')
ax2.set_ylabel('Power')
ax2.set_title('Frequency Domain (Power Spectrum)')
ax2.set_xlim(0, 200)  # Show up to 200 Hz
ax2.grid(True)

plt.tight_layout()
plt.show()

5.3 Understanding FFT Parameters

Parameter	Formula	Meaning
Nyquist frequency	\(f_N = f_s / 2\)	Maximum frequency you can measure
Frequency resolution	\(\Delta f = f_s / N\)	Smallest frequency difference you can distinguish
Total duration	\(T = N / f_s\)	Longer duration = better frequency resolution

Example: With \(f_s = 1000\) Hz and \(N = 1000\) samples:

Nyquist frequency: 500 Hz
Frequency resolution: 1 Hz
Duration: 1 second

5.4 Aliasing

If your signal contains frequencies above the Nyquist frequency (\(f_s/2\)), they will “fold back” and appear at incorrect frequencies. This is called aliasing.

import numpy as np
import matplotlib.pyplot as plt

def demonstrate_aliasing(signal_freq, sample_rate):
    """Show how undersampling causes aliasing."""
    nyquist = sample_rate / 2

    print(f"Signal: {signal_freq} Hz")
    print(f"Sample rate: {sample_rate} Hz")
    print(f"Nyquist: {nyquist} Hz")

    if signal_freq > nyquist:
        aliased = abs(signal_freq - sample_rate * round(signal_freq / sample_rate))
        print(f"ALIASED to: {aliased} Hz")
    else:
        print("No aliasing (signal below Nyquist)")

# Examples
demonstrate_aliasing(50, 1000)   # OK: 50 Hz < 500 Hz
demonstrate_aliasing(800, 1000)  # Aliased: 800 Hz > 500 Hz, appears at 200 Hz

Rule of thumb: Sample at least 2x your highest frequency of interest (preferably 5-10x).

5.5 Identifying Noise Sources

Common frequencies to look for in your spectrum:

Frequency	Source
60 Hz (and harmonics: 120, 180…)	Power line interference
Low frequency drift	Temperature changes, mechanical vibration
High frequency noise	Electronic noise, quantization

For a deeper introduction to Fourier analysis including windowing, aliasing, and practice exercises, see Fourier Analysis.

6 Error Propagation

When you calculate a result from measured quantities, uncertainties propagate through the calculation.

6.1 Manual Error Propagation

For a function \(f(x, y)\) with independent uncertainties \(\sigma_x\) and \(\sigma_y\):

\[\sigma_f = \sqrt{\left(\frac{\partial f}{\partial x}\right)^2 \sigma_x^2 + \left(\frac{\partial f}{\partial y}\right)^2 \sigma_y^2}\]

Example: For \(f = x \cdot y\):

import numpy as np

x = 5.0
sigma_x = 0.1
y = 3.0
sigma_y = 0.2

f = x * y  # = 15.0

# Partial derivatives: df/dx = y, df/dy = x
sigma_f = np.sqrt((y * sigma_x)**2 + (x * sigma_y)**2)

print(f"f = {f:.2f} +/- {sigma_f:.2f}")

6.2 Using the Uncertainties Package

The uncertainties package handles error propagation automatically:

from uncertainties import ufloat
import uncertainties.umath as umath

# Define values with uncertainties
x = ufloat(5.0, 0.1)  # 5.0 +/- 0.1
y = ufloat(3.0, 0.2)  # 3.0 +/- 0.2

# Arithmetic propagates uncertainties automatically
f = x * y
print(f"x * y = {f}")  # 15.0+/-1.1

g = x + y
print(f"x + y = {g}")  # 8.0+/-0.22

h = x / y
print(f"x / y = {h}")  # 1.67+/-0.12

6.3 Math Functions with Uncertainties

from uncertainties import ufloat
import uncertainties.umath as umath

angle = ufloat(0.5, 0.01)  # radians

# Trig functions
sin_val = umath.sin(angle)
cos_val = umath.cos(angle)

print(f"sin({angle}) = {sin_val}")
print(f"cos({angle}) = {cos_val}")

# Exponential and log
value = ufloat(2.0, 0.1)
exp_val = umath.exp(value)
log_val = umath.log(value)

print(f"exp({value}) = {exp_val}")
print(f"log({value}) = {log_val}")

6.4 Working with Arrays

from uncertainties import ufloat, unumpy
import numpy as np

# Create arrays with uncertainties
values = [ufloat(1.0, 0.1), ufloat(2.0, 0.15), ufloat(3.0, 0.2)]

# Or from separate arrays
nominal = np.array([1.0, 2.0, 3.0])
errors = np.array([0.1, 0.15, 0.2])
values = unumpy.uarray(nominal, errors)

# Operations work element-wise
squared = values ** 2
print(squared)

# Extract nominal values and uncertainties
print(f"Nominal: {unumpy.nominal_values(squared)}")
print(f"Std dev: {unumpy.std_devs(squared)}")

6.5 Example: Beam Waist Calculation

from uncertainties import ufloat
import uncertainties.umath as umath

# Measured beam width at two positions
w1 = ufloat(0.50, 0.02)  # mm
w2 = ufloat(0.75, 0.03)  # mm
z = ufloat(100.0, 1.0)   # mm between measurements

wavelength = 632.8e-6  # mm (He-Ne laser)

# Calculate beam waist (simplified formula)
# w0 = sqrt(w1 * w2) for certain geometries
w0 = umath.sqrt(w1 * w2)

print(f"Beam waist: {w0} mm")
# Output includes propagated uncertainty

7 Common Troubleshooting

7.1 Import Errors

Problem: ModuleNotFoundError: No module named 'numpy'

Solution: Install the missing package:

pip install numpy

7.2 Plot Not Showing

Problem: plt.show() does nothing

Solutions:

In Jupyter: Add %matplotlib inline at the top of your notebook
In VS Code: Make sure you’re not running in a non-interactive environment
Try plt.savefig('plot.png') to save instead of display

7.3 Curve Fit Not Converging

Problem: OptimizeWarning: Covariance of the parameters could not be estimated

Solutions:

Provide better initial guesses (p0 parameter)
Check that your model function is appropriate for the data
Check for NaN or Inf values in your data
Try setting bounds on parameters

Back to Python Resources