eBPF (extended Berkeley Packet Filter) represents a paradigm shift in Linux system observability, enabling unprecedented visibility into kernel and application behavior without modifying kernel source code or loading kernel modules. This comprehensive guide demonstrates implementing eBPF-based observability solutions on AlmaLinux for performance monitoring, security analysis, and troubleshooting.
Understanding eBPF Architecture
eBPF is a revolutionary technology that allows running sandboxed programs in privileged kernel context:
- In-kernel virtual machine: Safe execution of custom code
- JIT compilation: Near-native performance
- Verification: Ensures program safety before execution
- Event-driven: Attach to various kernel and user space events
- Maps: Efficient data sharing between kernel and user space
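To make the last two points concrete, the short BCC sketch below counts clone() system calls per process in a kernel-side hash map and reads that map from user space once per second. It is a minimal illustration, assuming BCC is installed; the probed syscall and the one-second interval are arbitrary choices.
#!/usr/bin/env python3
# map_demo.py - minimal sketch: kernel-side counting, user-space map reads
from bcc import BPF
import time

bpf_text = """
#include <uapi/linux/ptrace.h>

BPF_HASH(calls, u32, u64);          // map shared between kernel and user space

int count_clone(struct pt_regs *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    calls.increment(pid);           // verified, JIT-compiled in-kernel update
    return 0;
}
"""

b = BPF(text=bpf_text)
# Resolve the architecture-specific syscall symbol (e.g. __x64_sys_clone)
b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="count_clone")

print("Counting clone() calls per PID... Ctrl-C to stop")
try:
    while True:
        time.sleep(1)
        for pid, count in sorted(b["calls"].items(),
                                 key=lambda kv: kv[1].value, reverse=True)[:5]:
            print(f"PID {pid.value}: {count.value} calls")
        b["calls"].clear()
except KeyboardInterrupt:
    pass
All filtering and aggregation happens inside the verified, JIT-compiled program; user space only reads the aggregated map.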
Key Use Cases
- Performance Monitoring: CPU, memory, I/O profiling
- Network Analysis: Packet filtering, load balancing
- Security: Runtime threat detection and prevention
- Tracing: Application and system behavior analysis
- Observability: Custom metrics and insights
Prerequisites
Before implementing eBPF observability:
- AlmaLinux 9 (ships kernel 5.14, which includes mature eBPF support)
- Root or sudo access
- Development tools installed
- Basic understanding of Linux kernel concepts
- Python 3.8+ for BCC tools
Setting Up eBPF Development Environment
Installing Required Packages
# Enable EPEL repository
sudo dnf install -y epel-release
# Install development tools
sudo dnf groupinstall -y "Development Tools"
# Install kernel headers and development packages
sudo dnf install -y kernel-devel kernel-headers
# Install LLVM and Clang for eBPF compilation
sudo dnf install -y llvm clang
# Install additional dependencies
sudo dnf install -y \
bpftool \
libbpf-devel \
elfutils-libelf-devel \
perf \
python3-devel \
python3-pip
# Install BCC (BPF Compiler Collection)
sudo dnf install -y bcc-tools python3-bcc
# Install bpftrace
sudo dnf install -y bpftrace
# Verify installations
bpftool version
bpftrace --version
python3 -c "import bcc; print('BCC version:', bcc.__version__)"
Configuring Kernel Parameters
# Enable eBPF JIT compiler for better performance
echo 'net.core.bpf_jit_enable=1' | sudo tee -a /etc/sysctl.conf
echo 'net.core.bpf_jit_harden=0' | sudo tee -a /etc/sysctl.conf
# Allow unprivileged BPF (optional; 0 relaxes a security hardening default,
# keep it at 1 or 2 on production systems unless you specifically need it)
echo 'kernel.unprivileged_bpf_disabled=0' | sudo tee -a /etc/sysctl.conf
# Enable per-program runtime statistics (visible in `bpftool prog show`)
echo 'kernel.bpf_stats_enabled=1' | sudo tee -a /etc/sysctl.conf
# Apply sysctl changes
sudo sysctl -p
# Verify tracing support (listing tracefs events requires root)
sudo ls /sys/kernel/debug/tracing/events/
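Before moving on, a small preflight check can confirm the pieces above are in place. The following sketch (run with sudo so the tracefs check succeeds) simply inspects well-known sysctl and filesystem paths; the set of checks is an assumption about what this guide relies on, not an exhaustive capability probe.
#!/usr/bin/env python3
# ebpf_preflight.py - sketch: sanity-check the eBPF prerequisites configured above
import os

checks = {
    "JIT compiler enabled": ("/proc/sys/net/core/bpf_jit_enable", lambda v: v.strip() != "0"),
    "BPF stats enabled":    ("/proc/sys/kernel/bpf_stats_enabled", lambda v: v.strip() == "1"),
    "Kernel BTF available": ("/sys/kernel/btf/vmlinux", None),
    "bpffs mounted":        ("/sys/fs/bpf", None),
    "tracefs events":       ("/sys/kernel/debug/tracing/events", None),
}

for name, (path, predicate) in checks.items():
    if not os.path.exists(path):
        print(f"[FAIL] {name}: {path} not found")
    elif predicate is None:
        print(f"[ OK ] {name}: {path}")
    else:
        with open(path) as f:
            ok = predicate(f.read())
        print(f"[{' OK ' if ok else 'WARN'}] {name}: {path}")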
Using BCC Tools for System Observability
CPU Performance Analysis
#!/usr/bin/env python3
# cpu_profile.py - CPU profiling with eBPF
from bcc import BPF
from time import sleep
import signal
import sys
# eBPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct key_t {
u32 pid;
u64 kernel_ip;
u64 user_ip;
char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, 16384);
int do_perf_event(struct bpf_perf_event_data *ctx) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
// Skip the idle task (pid 0)
if (pid == 0)
return 0;
struct key_t key = {};
key.pid = pid;
// Get process name
bpf_get_current_comm(&key.name, sizeof(key.name));
    // Record the instruction and stack pointers at sample time
    // (use stack_traces.get_stackid() for full stack traces; see the sketch below)
    key.kernel_ip = PT_REGS_IP(&ctx->regs);
    key.user_ip = PT_REGS_SP(&ctx->regs);
counts.increment(key);
return 0;
}
"""
# Load BPF program
b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=BPF.PERF_TYPE_SOFTWARE,
ev_config=BPF.PERF_COUNT_SW_CPU_CLOCK,
fn_name="do_perf_event",
sample_freq=99)
print("Sampling CPU usage... Press Ctrl-C to stop")
# Signal handler
def signal_handler(sig, frame):
print("\n\nTop CPU consumers:")
counts = b.get_table("counts")
for k, v in sorted(counts.items(), key=lambda x: x[1].value, reverse=True)[:20]:
print(f"PID: {k.pid:6d} Process: {k.name.decode('utf-8', 'replace'):16s} "
f"Samples: {v.value:8d}")
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
signal.pause()
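The profiler above records only the instruction and stack pointers at each sample and leaves the declared stack_traces map unused. To turn samples into readable stacks, the usual BCC pattern is to store stack IDs and resolve them in user space with ksym()/sym(). The sketch below shows that variant; the 10-second sampling window and top-5 cutoff are illustrative choices.
#!/usr/bin/env python3
# stack_profile.py - sketch: sampled stacks resolved to symbols with BCC
from bcc import BPF
from time import sleep

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    int kernel_stack_id;
    int user_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, 16384);

int do_perf_event(struct bpf_perf_event_data *ctx) {
    struct key_t key = {};
    key.pid = bpf_get_current_pid_tgid() >> 32;
    bpf_get_current_comm(&key.name, sizeof(key.name));
    // Store stack IDs instead of raw IP/SP values
    key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0);
    key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK);
    counts.increment(key);
    return 0;
}
"""

b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=BPF.PERF_TYPE_SOFTWARE,
                    ev_config=BPF.PERF_COUNT_SW_CPU_CLOCK,
                    fn_name="do_perf_event", sample_freq=99)

print("Sampling for 10 seconds...")
sleep(10)

counts = b.get_table("counts")
stacks = b.get_table("stack_traces")
for k, v in sorted(counts.items(), key=lambda kv: kv[1].value, reverse=True)[:5]:
    print(f"\n{k.name.decode('utf-8', 'replace')} (PID {k.pid}): {v.value} samples")
    if k.kernel_stack_id >= 0:
        for addr in stacks.walk(k.kernel_stack_id):
            print("  kernel: %s" % b.ksym(addr).decode('utf-8', 'replace'))
    if k.user_stack_id >= 0:
        for addr in stacks.walk(k.user_stack_id):
            print("  user:   %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))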
Memory Allocation Tracking
#!/usr/bin/env python3
# memory_tracker.py - Track memory allocations
from bcc import BPF
import time
# eBPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
struct alloc_info_t {
u64 size;
u64 timestamp_ns;
int stack_id;
};
BPF_HASH(allocs, u64, struct alloc_info_t);
BPF_HASH(memstats, u32, u64);
BPF_STACK_TRACE(stack_traces, 16384);
int malloc_enter(struct pt_regs *ctx, size_t size) {
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid >> 32;
u64 ts = bpf_ktime_get_ns();
struct alloc_info_t info = {};
info.size = size;
info.timestamp_ns = ts;
info.stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
allocs.update(&pid_tgid, &info);
    // Update cumulative allocation statistics (frees are not subtracted here)
u64 *total = memstats.lookup(&pid);
if (total) {
(*total) += size;
} else {
memstats.update(&pid, &size);
}
return 0;
}
int malloc_return(struct pt_regs *ctx) {
u64 pid_tgid = bpf_get_current_pid_tgid();
struct alloc_info_t *info = allocs.lookup(&pid_tgid);
if (info == 0)
return 0;
u64 addr = PT_REGS_RC(ctx);
if (addr != 0) {
allocs.delete(&pid_tgid);
}
return 0;
}
"""
# Load and attach
b = BPF(text=bpf_text)
b.attach_uprobe(name="c", sym="malloc", fn_name="malloc_enter")
b.attach_uretprobe(name="c", sym="malloc", fn_name="malloc_return")
print("Tracking memory allocations... Press Ctrl-C to stop")
while True:
try:
time.sleep(1)
print("\nTop memory consumers by PID:")
memstats = b.get_table("memstats")
for k, v in sorted(memstats.items(),
key=lambda x: x[1].value,
reverse=True)[:10]:
print(f"PID {k.value}: {v.value / 1024 / 1024:.2f} MB")
except KeyboardInterrupt:
break
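Because the tracker above never subtracts freed memory, its numbers are cumulative allocation volume rather than live usage. A leak-oriented variant also hooks free() and keeps a map of outstanding allocations, in the spirit of BCC's memleak tool. The sketch below follows that approach; the glibc library name "c", the map sizes, and the 5-second reporting interval are assumptions.
#!/usr/bin/env python3
# leak_tracker.py - sketch: outstanding (not yet freed) heap bytes per PID
from bcc import BPF
import time

bpf_text = """
#include <uapi/linux/ptrace.h>

BPF_HASH(sizes, u64, u64);        // pid_tgid -> requested size (entry to return)
BPF_HASH(live, u64, u64);         // allocation address -> size
BPF_HASH(outstanding, u32, u64);  // pid -> bytes currently allocated

int malloc_enter(struct pt_regs *ctx, size_t size) {
    u64 id = bpf_get_current_pid_tgid();
    u64 sz = size;
    sizes.update(&id, &sz);
    return 0;
}

int malloc_return(struct pt_regs *ctx) {
    u64 id = bpf_get_current_pid_tgid();
    u64 *sz = sizes.lookup(&id);
    if (!sz)
        return 0;
    u64 addr = PT_REGS_RC(ctx);
    if (addr) {
        live.update(&addr, sz);
        u32 pid = id >> 32;
        u64 *total = outstanding.lookup(&pid);
        if (total)
            (*total) += *sz;
        else
            outstanding.update(&pid, sz);
    }
    sizes.delete(&id);
    return 0;
}

int free_enter(struct pt_regs *ctx, void *ptr) {
    u64 addr = (u64)ptr;
    u64 *sz = live.lookup(&addr);
    if (!sz)
        return 0;
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u64 *total = outstanding.lookup(&pid);
    if (total && *total >= *sz)
        (*total) -= *sz;
    live.delete(&addr);
    return 0;
}
"""

b = BPF(text=bpf_text)
b.attach_uprobe(name="c", sym="malloc", fn_name="malloc_enter")
b.attach_uretprobe(name="c", sym="malloc", fn_name="malloc_return")
b.attach_uprobe(name="c", sym="free", fn_name="free_enter")

print("Tracking outstanding allocations... Ctrl-C to stop")
try:
    while True:
        time.sleep(5)
        print("\nOutstanding heap bytes by PID:")
        for pid, total in sorted(b["outstanding"].items(),
                                 key=lambda kv: kv[1].value, reverse=True)[:10]:
            print(f"PID {pid.value}: {total.value / 1024:.1f} KiB")
except KeyboardInterrupt:
    pass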
Network Observability with eBPF
TCP Connection Monitoring
#!/usr/bin/env python3
# tcp_monitor.py - Monitor TCP connections
from bcc import BPF
import socket
import struct
# eBPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
struct tcp_event_t {
u32 pid;
u32 saddr;
u32 daddr;
u16 sport;
u16 dport;
u64 ts_ns;
char comm[TASK_COMM_LEN];
u32 netns;
u8 type; // 0: connect, 1: accept, 2: close
};
BPF_PERF_OUTPUT(tcp_events);
BPF_HASH(currsock, u32, struct sock *);
int trace_connect_entry(struct pt_regs *ctx, struct sock *sk) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
currsock.update(&pid, &sk);
return 0;
}
int trace_connect_return(struct pt_regs *ctx) {
int ret = PT_REGS_RC(ctx);
if (ret != 0)
return 0;
u32 pid = bpf_get_current_pid_tgid() >> 32;
struct sock **skpp = currsock.lookup(&pid);
if (skpp == 0)
return 0;
struct sock *sk = *skpp;
struct tcp_event_t event = {};
event.pid = pid;
event.ts_ns = bpf_ktime_get_ns();
event.type = 0; // connect
// Get connection details
bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_rcv_saddr);
bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_daddr);
bpf_probe_read_kernel(&event.sport, sizeof(event.sport), &sk->__sk_common.skc_num);
bpf_probe_read_kernel(&event.dport, sizeof(event.dport), &sk->__sk_common.skc_dport);
// Get process name
bpf_get_current_comm(&event.comm, sizeof(event.comm));
// Get network namespace
struct net *net;
bpf_probe_read_kernel(&net, sizeof(net), &sk->__sk_common.skc_net.net);
bpf_probe_read_kernel(&event.netns, sizeof(event.netns), &net->ns.inum);
event.dport = ntohs(event.dport);
tcp_events.perf_submit(ctx, &event, sizeof(event));
currsock.delete(&pid);
return 0;
}
"""
# Load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_entry")
b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_return")
# Process events
def print_tcp_event(cpu, data, size):
event = b["tcp_events"].event(data)
saddr = socket.inet_ntoa(struct.pack("I", event.saddr))
daddr = socket.inet_ntoa(struct.pack("I", event.daddr))
event_type = ["CONNECT", "ACCEPT", "CLOSE"][event.type]
print(f"{event.ts_ns / 1e9:.6f} {event.comm.decode('utf-8', 'replace'):16s} "
f"PID:{event.pid:6d} {event_type:8s} "
f"{saddr}:{event.sport} -> {daddr}:{event.dport} "
f"netns:{event.netns}")
# Attach callback
b["tcp_events"].open_perf_buffer(print_tcp_event)
print("Monitoring TCP connections... Press Ctrl-C to stop")
print("TIME(s) COMM PID TYPE CONNECTION")
while True:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
break
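When per-event detail is not needed, the same kprobe can feed a kernel-side counter instead of a perf buffer, which keeps overhead lower because no per-connection data crosses into user space. A minimal sketch of that aggregation pattern follows; the 10-second window and top-10 cutoff are arbitrary.
#!/usr/bin/env python3
# tcp_connect_count.py - sketch: count outbound IPv4 connect() attempts per process
from bcc import BPF
import time

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct key_t {
    char comm[TASK_COMM_LEN];
};
BPF_HASH(connects, struct key_t, u64);

int count_connect(struct pt_regs *ctx) {
    struct key_t key = {};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    connects.increment(key);
    return 0;
}
"""

b = BPF(text=bpf_text)
b.attach_kprobe(event="tcp_v4_connect", fn_name="count_connect")

print("Counting TCP connect() calls per process... Ctrl-C to stop")
try:
    while True:
        time.sleep(10)
        print("\nConnects in the last 10s:")
        for k, v in sorted(b["connects"].items(),
                           key=lambda kv: kv[1].value, reverse=True)[:10]:
            print(f"{k.comm.decode('utf-8', 'replace'):16s} {v.value}")
        b["connects"].clear()
except KeyboardInterrupt:
    pass
The same trade-off (aggregate in kernel maps versus stream every event) applies to most of the monitors in this guide.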
Security Monitoring with eBPF
File Access Monitoring
#!/usr/bin/env python3
# file_monitor.py - Monitor file access with security context
from bcc import BPF
import pwd
import grp
# eBPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/openat2.h>
#include <linux/sched.h>
#include <linux/fs.h>
struct file_event_t {
u32 pid;
u32 uid;
u32 gid;
char comm[TASK_COMM_LEN];
char filename[256];
int flags;
int mode;
};
BPF_PERF_OUTPUT(file_events);
int trace_open(struct pt_regs *ctx, int dfd, const char __user *filename, struct open_how *how) {
struct file_event_t event = {};
u64 pid_tgid = bpf_get_current_pid_tgid();
event.pid = pid_tgid >> 32;
u64 uid_gid = bpf_get_current_uid_gid();
event.uid = uid_gid & 0xFFFFFFFF;
event.gid = uid_gid >> 32;
bpf_get_current_comm(&event.comm, sizeof(event.comm));
    bpf_probe_read_user_str(&event.filename, sizeof(event.filename), filename);
    // do_sys_openat2() receives flags and mode packed inside struct open_how
    u64 how_flags = 0, how_mode = 0;
    bpf_probe_read_kernel(&how_flags, sizeof(how_flags), &how->flags);
    bpf_probe_read_kernel(&how_mode, sizeof(how_mode), &how->mode);
    event.flags = how_flags;
    event.mode = how_mode;
file_events.perf_submit(ctx, &event, sizeof(event));
return 0;
}
"""
# Load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="do_sys_openat2", fn_name="trace_open")
# Access modes and flag bits (the access mode is handled separately in decode_flags)
ACCESS_MODES = {
    0x0: "O_RDONLY",
    0x1: "O_WRONLY",
    0x2: "O_RDWR",
}
FLAG_NAMES = {
    0x40: "O_CREAT",
    0x200: "O_TRUNC",
    0x400: "O_APPEND",
}
def get_username(uid):
try:
return pwd.getpwuid(uid).pw_name
except:
return str(uid)
def get_groupname(gid):
try:
return grp.getgrgid(gid).gr_name
except:
return str(gid)
def decode_flags(flags):
    # O_RDONLY/O_WRONLY/O_RDWR occupy the low two bits, so they must be
    # compared rather than bit-tested (O_RDONLY is 0)
    flag_strs = [ACCESS_MODES.get(flags & 0x3, "O_ACCMODE")]
    for val, name in FLAG_NAMES.items():
        if flags & val:
            flag_strs.append(name)
    return "|".join(flag_strs)
# Process events
def print_file_event(cpu, data, size):
event = b["file_events"].event(data)
username = get_username(event.uid)
groupname = get_groupname(event.gid)
flags_str = decode_flags(event.flags)
    # Security checks
    filename = event.filename.decode('utf-8', 'replace')
    security_alerts = []
    if "/etc/passwd" in filename:
        security_alerts.append("PASSWD_ACCESS")
    if "/etc/shadow" in filename:
        security_alerts.append("SHADOW_ACCESS")
    if event.uid == 0 and (event.flags & 0x3) in (0x1, 0x2):  # root opening for write
        security_alerts.append("ROOT_WRITE")
alert_str = " [" + ",".join(security_alerts) + "]" if security_alerts else ""
print(f"{event.comm.decode('utf-8', 'replace'):16s} "
f"PID:{event.pid:6d} UID:{username:8s} GID:{groupname:8s} "
f"FLAGS:{flags_str:20s} "
f"FILE:{event.filename.decode('utf-8', 'replace'):40s}"
f"{alert_str}")
# Attach callback
b["file_events"].open_perf_buffer(print_file_event)
print("Monitoring file access... Press Ctrl-C to stop")
print("COMM PID UID GID FLAGS FILE")
while True:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
break
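For security pipelines it is often more useful to emit structured alerts than console lines. The sketch below is a narrower watcher built on the stable sys_enter_openat tracepoint that writes JSON records for a small watchlist of sensitive paths; the watchlist and the log location /var/log/ebpf-file-alerts.json are assumptions chosen for illustration.
#!/usr/bin/env python3
# sensitive_file_watch.py - sketch: JSON alerts for access to a watchlist of paths
from bcc import BPF
import json
import time

bpf_text = """
#include <linux/sched.h>

struct event_t {
    u32 pid;
    u32 uid;
    char comm[TASK_COMM_LEN];
    char filename[256];
};
BPF_PERF_OUTPUT(events);

TRACEPOINT_PROBE(syscalls, sys_enter_openat) {
    struct event_t event = {};
    event.pid = bpf_get_current_pid_tgid() >> 32;
    event.uid = bpf_get_current_uid_gid() & 0xFFFFFFFF;
    bpf_get_current_comm(&event.comm, sizeof(event.comm));
    bpf_probe_read_user_str(&event.filename, sizeof(event.filename), args->filename);
    events.perf_submit(args, &event, sizeof(event));
    return 0;
}
"""

WATCHLIST = ("/etc/passwd", "/etc/shadow", "/etc/sudoers")
LOGFILE = "/var/log/ebpf-file-alerts.json"

b = BPF(text=bpf_text)

def handle(cpu, data, size):
    event = b["events"].event(data)
    path = event.filename.decode("utf-8", "replace")
    if not path.startswith(WATCHLIST):
        return
    record = {
        "ts": time.time(),
        "pid": event.pid,
        "uid": event.uid,
        "comm": event.comm.decode("utf-8", "replace"),
        "file": path,
    }
    with open(LOGFILE, "a") as f:
        f.write(json.dumps(record) + "\n")
    print(json.dumps(record))

b["events"].open_perf_buffer(handle)
print("Watching sensitive files... Ctrl-C to stop")
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        break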
Custom eBPF Programs
System Call Latency Monitoring
// syscall_latency.c - Monitor system call latency
// Built CO-RE style against vmlinux.h, generated with:
//   bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
struct syscall_data {
    u64 start_ns;
    u64 syscall_nr;
};
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10240);
    __type(key, u32);
    __type(value, struct syscall_data);
} syscall_start SEC(".maps");
// Latest observed latency (us) per syscall number. The kernel has no
// histogram map type, so bucketing/aggregation is left to user space.
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1024);
    __type(key, u64);
    __type(value, u64);
} syscall_latency SEC(".maps");
SEC("tracepoint/raw_syscalls/sys_enter")
int trace_syscall_enter(struct trace_event_raw_sys_enter *ctx)
{
    // Key by thread ID so concurrent threads in one process
    // don't overwrite each other's start timestamps
    u32 tid = (u32)bpf_get_current_pid_tgid();
    struct syscall_data data = {};
    data.start_ns = bpf_ktime_get_ns();
    data.syscall_nr = ctx->id;
    bpf_map_update_elem(&syscall_start, &tid, &data, BPF_ANY);
    return 0;
}
SEC("tracepoint/raw_syscalls/sys_exit")
int trace_syscall_exit(struct trace_event_raw_sys_exit *ctx)
{
    u32 tid = (u32)bpf_get_current_pid_tgid();
    struct syscall_data *data;
    data = bpf_map_lookup_elem(&syscall_start, &tid);
    if (!data)
        return 0;
    u64 latency_ns = bpf_ktime_get_ns() - data->start_ns;
    u64 latency_us = latency_ns / 1000;
    // Log high-latency syscalls (> 1 ms) to the trace pipe
    if (latency_us > 1000) {
        bpf_printk("High latency syscall: nr=%llu latency=%llu us\n",
                   data->syscall_nr, latency_us);
    }
    // Record the latest latency for this syscall number
    u64 slot = data->syscall_nr;
    bpf_map_update_elem(&syscall_latency, &slot, &latency_us, BPF_ANY);
    bpf_map_delete_elem(&syscall_start, &tid);
    return 0;
}
char LICENSE[] SEC("license") = "GPL";
Compiling and Loading Custom eBPF
#!/bin/bash
# compile_and_load_ebpf.sh
# Generate vmlinux.h for CO-RE compilation
bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
# Compile the eBPF program (-g emits the BTF the loader needs)
clang -O2 -g -target bpf -c syscall_latency.c -o syscall_latency.o
# Load every program in the object, pin programs and maps under bpffs, and
# attach the tracepoints. 'autoattach' requires a recent bpftool; on older
# versions, attach from a small libbpf loader instead.
sudo mkdir -p /sys/fs/bpf/syscall_latency/maps
sudo bpftool prog loadall syscall_latency.o /sys/fs/bpf/syscall_latency \
    pinmaps /sys/fs/bpf/syscall_latency/maps autoattach
# Verify the programs are loaded and pinned
sudo bpftool prog show pinned /sys/fs/bpf/syscall_latency/trace_syscall_enter
# Watch the bpf_printk output for high-latency syscalls
sudo cat /sys/kernel/debug/tracing/trace_pipe | grep -i "High latency"
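Once the maps are pinned, user space can read them without a custom loader by shelling out to bpftool's JSON output. The sketch below (run as root) dumps the pinned syscall_latency map and prints the slowest syscall numbers; the pin path matches the pinmaps directory used above, and the parsing is kept defensive because bpftool's JSON layout differs depending on whether BTF type information is available.
#!/usr/bin/env python3
# read_syscall_latency.py - sketch: read the pinned latency map via bpftool JSON output
import json
import subprocess

MAP_PATH = "/sys/fs/bpf/syscall_latency/maps/syscall_latency"

def to_int(field):
    # bpftool emits formatted integers when BTF is available, otherwise byte-string lists
    if isinstance(field, list):
        return int.from_bytes(bytes(int(b, 0) for b in field), "little")
    return int(field)

raw = subprocess.run(["bpftool", "--json", "map", "dump", "pinned", MAP_PATH],
                     check=True, capture_output=True, text=True).stdout

rows = []
for entry in json.loads(raw):
    src = entry.get("formatted", entry)
    rows.append((to_int(src["key"]), to_int(src["value"])))

print("syscall_nr  latest_latency_us")
for nr, lat in sorted(rows, key=lambda r: r[1], reverse=True)[:15]:
    print(f"{nr:10d}  {lat}")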
Using bpftrace for Quick Analysis
One-liner Examples
# Count system calls by process
sudo bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'
# Monitor process execution
sudo bpftrace -e 'tracepoint:syscalls:sys_enter_execve { printf("%s called execve(%s)\n", comm, str(args->filename)); }'
# Track TCP retransmissions
sudo bpftrace -e 'kprobe:tcp_retransmit_skb { @retransmits[comm] = count(); }'
# Monitor page faults by process
sudo bpftrace -e 'tracepoint:exceptions:page_fault_user { @faults[comm] = count(); }'
# Trace slow disk I/O (> 10ms)
sudo bpftrace -e 'kprobe:blk_account_io_start { @start[arg0] = nsecs; }
kprobe:blk_account_io_done /@start[arg0]/ {
$lat = (nsecs - @start[arg0]) / 1000000;
if ($lat > 10) {
printf("%s: %d ms\n", comm, $lat);
}
delete(@start[arg0]);
}'
Complex bpftrace Scripts
#!/usr/bin/env bpftrace
# io_latency.bt - I/O latency distribution by device
BEGIN
{
printf("Tracing block I/O latency... Hit Ctrl-C to end.\n");
}
kprobe:blk_account_io_start
{
@start[arg0] = nsecs;
}
kprobe:blk_account_io_done
/@start[arg0]/
{
$latency_us = (nsecs - @start[arg0]) / 1000;
@latency_dist = hist($latency_us);
if ($latency_us > 10000) { // > 10ms
time("%H:%M:%S ");
printf("Slow I/O: %s %d us\n", comm, $latency_us);
}
delete(@start[arg0]);
}
END
{
clear(@start);
}
Production Deployment Best Practices
Performance Impact Monitoring
#!/bin/bash
# monitor_ebpf_overhead.sh
# Enable per-program runtime statistics (adds run_time_ns/run_cnt to `bpftool prog show`)
sudo sysctl -w kernel.bpf_stats_enabled=1
# Check CPU overhead per program (prog profile needs fentry/BTF support)
echo "=== eBPF CPU Overhead ==="
for PROG_ID in $(sudo bpftool prog show | awk -F: '/^[0-9]+:/ {print $1}'); do
    sudo bpftool prog profile id "$PROG_ID" duration 10 cycles instructions
done
# Check memory usage per map
echo "=== eBPF Memory Usage ==="
for MAP_ID in $(sudo bpftool map show | awk -F: '/^[0-9]+:/ {print $1}'); do
    sudo bpftool map show id "$MAP_ID"
done
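With kernel.bpf_stats_enabled=1, the kernel accounts run time and invocation counts per program, and bpftool exposes them in its JSON output. The sketch below (run as root) summarizes that data per program; it assumes a bpftool version that reports the run_time_ns and run_cnt fields when statistics are enabled.
#!/usr/bin/env python3
# ebpf_runtime_stats.py - sketch: per-program CPU accounting from bpftool JSON
import json
import subprocess

raw = subprocess.run(["bpftool", "--json", "prog", "show"],
                     check=True, capture_output=True, text=True).stdout

for prog in json.loads(raw):
    run_ns = prog.get("run_time_ns", 0)
    run_cnt = prog.get("run_cnt", 0)
    avg_us = (run_ns / run_cnt / 1000) if run_cnt else 0.0
    print(f"id={prog['id']:<6} type={prog.get('type', '?'):<12} "
          f"name={prog.get('name', '-'):<16} runs={run_cnt:<10} "
          f"total_ms={run_ns / 1e6:10.2f} avg_us={avg_us:8.2f}")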
Security Considerations
# ebpf-security-policy.yaml (illustrative policy document; enforcement is left to your own tooling)
apiVersion: v1
kind: ConfigMap
metadata:
name: ebpf-security-config
data:
policy.conf: |
# eBPF Security Policy
# Restrict eBPF usage to specific users/groups
allowed_users:
- root
- ebpf-admin
# Limit program types
allowed_prog_types:
- BPF_PROG_TYPE_KPROBE
- BPF_PROG_TYPE_TRACEPOINT
- BPF_PROG_TYPE_PERF_EVENT
# Deny dangerous helpers
denied_helpers:
- bpf_override_return
- bpf_probe_write_user
# Resource limits
max_programs: 100
max_maps: 200
max_map_entries: 100000
# Audit requirements
audit_enabled: true
audit_log: /var/log/ebpf-audit.log
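The policy above is only a document; something still has to check hosts against it. A small audit sketch along the following lines can compare loaded programs and key sysctls with the policy's intent. Run it as root; the allowed-type set and program limit mirror the example ConfigMap, and the lower-case type names are an assumption about bpftool's JSON output on recent versions.
#!/usr/bin/env python3
# ebpf_policy_check.py - sketch: audit the host against the policy above
import json
import subprocess

ALLOWED_TYPES = {"kprobe", "tracepoint", "perf_event"}
MAX_PROGRAMS = 100

with open("/proc/sys/kernel/unprivileged_bpf_disabled") as f:
    if f.read().strip() == "0":
        print("WARN: unprivileged BPF is enabled (kernel.unprivileged_bpf_disabled=0)")

progs = json.loads(subprocess.run(["bpftool", "--json", "prog", "show"],
                                  check=True, capture_output=True, text=True).stdout)

if len(progs) > MAX_PROGRAMS:
    print(f"WARN: {len(progs)} eBPF programs loaded (policy limit {MAX_PROGRAMS})")

for prog in progs:
    ptype = prog.get("type", "unknown")
    if ptype not in ALLOWED_TYPES:
        print(f"ALERT: prog id={prog['id']} name={prog.get('name', '-')} "
              f"uses non-allowed type '{ptype}'")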
Integrating with Monitoring Stack
Prometheus Exporter
#!/usr/bin/env python3
# ebpf_prometheus_exporter.py
from prometheus_client import start_http_server, Gauge, Counter
from bcc import BPF
import time
# Prometheus metrics
tcp_connections = Gauge('ebpf_tcp_connections_total',
'Total TCP connections',
['state'])
syscall_latency = Gauge('ebpf_syscall_latency_microseconds',
'System call latency',
['syscall'])
file_operations = Counter('ebpf_file_operations_total',
'File operations',
['operation', 'user'])
# eBPF program (placeholder: insert the collection code whose maps you want to export)
bpf_text = """
// eBPF collection code here
"""
def collect_metrics():
b = BPF(text=bpf_text)
# Attach probes and collect metrics
while True:
        # Update Prometheus metrics from eBPF maps
        # (the hard-coded values below are placeholders for real map reads)
tcp_connections.labels(state='established').set(100)
syscall_latency.labels(syscall='read').set(45.2)
file_operations.labels(operation='open', user='root').inc()
time.sleep(10)
if __name__ == '__main__':
# Start Prometheus metrics server
start_http_server(9090)
collect_metrics()
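As a concrete end-to-end example, the sketch below wires a real eBPF map into the exporter pattern shown above: it counts execve() calls per process name in the kernel and publishes them as a Prometheus counter. The metric name, label, port 9435, and 10-second refresh are assumptions for illustration.
#!/usr/bin/env python3
# exec_exporter.py - sketch: a minimal exporter wired to a real eBPF map
from prometheus_client import start_http_server, Counter
from bcc import BPF
import time

bpf_text = """
#include <linux/sched.h>

struct key_t {
    char comm[TASK_COMM_LEN];
};
BPF_HASH(execs, struct key_t, u64);

TRACEPOINT_PROBE(syscalls, sys_enter_execve) {
    struct key_t key = {};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    execs.increment(key);
    return 0;
}
"""

exec_total = Counter('ebpf_execve_calls_total',
                     'execve() calls observed via eBPF', ['comm'])

b = BPF(text=bpf_text)
start_http_server(9435)
print("Exporter listening on :9435/metrics")

seen = {}
while True:
    time.sleep(10)
    for k, v in b["execs"].items():
        comm = k.comm.decode('utf-8', 'replace')
        # Counters must only increase: publish the delta since the last pass
        delta = v.value - seen.get(comm, 0)
        if delta > 0:
            exec_total.labels(comm=comm).inc(delta)
        seen[comm] = v.value
Because Prometheus counters must be monotonic, the loop publishes only the delta observed since the previous pass rather than the raw map value.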
Grafana Dashboard Configuration
{
"dashboard": {
"title": "eBPF System Observability",
"panels": [
{
"title": "System Call Latency Heatmap",
"targets": [{
"expr": "rate(ebpf_syscall_latency_microseconds[5m])"
}],
"type": "heatmap"
},
{
"title": "TCP Connection States",
"targets": [{
"expr": "ebpf_tcp_connections_total"
}],
"type": "graph"
},
{
"title": "File Operations by User",
"targets": [{
"expr": "rate(ebpf_file_operations_total[5m])"
}],
"type": "bargauge"
}
]
}
}
Troubleshooting eBPF Programs
Common Issues and Solutions
#!/bin/bash
# troubleshoot_ebpf.sh
echo "=== eBPF Troubleshooting ==="
# Check kernel version and config
echo "1. Kernel Support:"
uname -r
grep -E "CONFIG_BPF|CONFIG_HAVE_EBPF_JIT" /boot/config-$(uname -r)
# Check available tracepoints
echo -e "\n2. Available Tracepoints:"
sudo ls /sys/kernel/debug/tracing/events/ | head -10
# Check loaded programs
echo -e "\n3. Loaded eBPF Programs:"
sudo bpftool prog list
# Check verifier logs
echo -e "\n4. Recent Verifier Errors:"
sudo dmesg | grep -i "bpf" | tail -20
# Check permissions
echo -e "\n5. BPF Permissions:"
ls -la /sys/fs/bpf/
getcap /usr/bin/bpftrace
# Resource usage
echo -e "\n6. BPF Resource Usage:"
echo "Loaded programs: $(sudo bpftool prog list | grep -cE '^[0-9]+:')"
echo "Active maps:     $(sudo bpftool map list | grep -cE '^[0-9]+:')"
Conclusion
eBPF transforms system observability on AlmaLinux, providing deep, low-overhead visibility into kernel and application behavior. By implementing the tools and techniques covered in this guide, you can build comprehensive observability solutions that enhance performance monitoring, security analysis, and troubleshooting capabilities.
The combination of eBPF’s safety guarantees, performance efficiency, and programmability makes it an essential technology for modern Linux system administration, enabling insights that were previously impossible or required kernel modifications.