%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /sbin/
Upload File :
Create Path :
Current File : //sbin/compactsnoop-bpfcc

#! /usr/bin/python3
# @lint-avoid-python-3-compatibility-imports
#
# compactsnoop  Trace compact zone and print details including issuing PID.
#       For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by
# compact zone begin, as well as a starting timestamp for calculating
# latency.
#
# Copyright (c) 2019 Wenbo Zhang
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 11-NOV-2019   Wenbo Zhang   Created this.

from __future__ import print_function
from bcc import BPF
import argparse
import platform
from datetime import datetime, timedelta
import sys

# arguments
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""

parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
        help="include timestamp on output")
parser.add_argument("-p", "--pid", help="trace this PID only")
parser.add_argument("-d", "--duration",
        help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
        help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
        help="show system memory state")
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
if args.duration:
    args.duration = timedelta(seconds=int(args.duration))

NO_EXTENDED = """
#ifdef EXTNEDED_FIELDS
#undef EXTNEDED_FIELDS
#endif
"""

EXTENDED = """
#define EXTNEDED_FIELDS    1
"""

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/compaction.h>

struct val_t {
    int nid;
    int idx;
    int order;
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    u64 ts;    // compaction begin time
};

struct data_t {
    u32 pid;
    u32 tid;
    int nid;
    int idx;
    int order;
    u64 delta;
    u64 ts;    // compaction end time
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    int status;
    int stack_id;
    char comm[TASK_COMM_LEN];
};

BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 2048);

#ifdef CONFIG_NUMA
static inline int zone_to_nid_(struct zone *zone)
{
    int node;
    bpf_probe_read_kernel(&node, sizeof(node), &zone->node);
    return node;
}
#else
static inline int zone_to_nid_(struct zone *zone)
{
    return 0;
}
#endif

// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read_kernel(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    return ((u64)zone - (u64)zone_pgdat->node_zones)/sizeof(struct zone);
}

#ifdef EXTNEDED_FIELDS
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 _watermark[NR_WMARK] = {};
    u64 watermark_boost = 0;

    bpf_probe_read_kernel(&_watermark, sizeof(_watermark), &zone->_watermark);
    bpf_probe_read_kernel(&watermark_boost, sizeof(watermark_boost),
                    &zone->watermark_boost);
    valp->min = _watermark[WMARK_MIN] + watermark_boost;
    valp->low = _watermark[WMARK_LOW] + watermark_boost;
    valp->high = _watermark[WMARK_HIGH] + watermark_boost;
    bpf_probe_read_kernel(&valp->free, sizeof(valp->free),
                    &zone->vm_stat[NR_FREE_PAGES]);
}
#endif

static inline void submit_event(void *ctx, int status)
{
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return;
    }

    data.delta = ts - valp->ts;
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTNEDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = status;
    data.stack_id = stack_traces.get_stackid(ctx, 0);

    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&id);
}

#ifdef EXTNEDED_FIELDS
int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    struct val_t val = { };
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    PID_FILTER
    val.fragindex = ret;
    start.update(&id, &val);
    return 0;
}
#endif

static inline void fill_compact_info(struct val_t *valp,
                                     struct zone *zone,
                                     int order)
{
    valp->nid = zone_to_nid_(zone);
    valp->idx = zone_idx_(zone);
    valp->order = order;
}

RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
{
    // TP_PROTO(struct zone *zone, int order, int ret)
    struct zone *zone = (struct zone *)ctx->args[0];
    int order = (int)ctx->args[1];
    int ret = (int)ctx->args[2];
    u64 id;

    if(ret != COMPACT_CONTINUE)
        return 0;

    id = bpf_get_current_pid_tgid();
    PID_FILTER

#ifdef EXTNEDED_FIELDS
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry or order <= PAGE_ALLOC_COSTLY_ORDER, eg:
        // manual trigger echo 1 > /proc/sys/vm/compact_memory
        struct val_t val = { .fragindex = -1000 };
        valp = &val;
        start.update(&id, valp);
    }
    fill_compact_info(valp, zone, order);
    get_all_wmark_pages(zone, valp);
#else
    struct val_t val = { };
    fill_compact_info(&val, zone, order);
    start.update(&id, &val);
#endif

    return 0;
}

TRACEPOINT_PROBE(compaction, mm_compaction_begin)
{
    bool sync = args->sync;

    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }

    valp->ts = bpf_ktime_get_ns();
    valp->sync = sync;
    return 0;
}

TRACEPOINT_PROBE(compaction, mm_compaction_end)
{
    submit_event(args, args->status);
    return 0;
}
"""

if platform.machine() != 'x86_64' and platform.machine() != 'ppc64le':
    print("""
          Currently only support x86_64 and power servers, if you want
          to use it on other platforms(including power embedded processors),
          please refer include/linux/mmzone.h to modify zone_idex_to_str to
          get the right zone type
    """)
    exit()

if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

if args.pid:
    bpf_text = bpf_text.replace("PID_FILTER",
                                "if (id >> 32 != %s) { return 0; }" % args.pid)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# load BPF program
b = BPF(text=bpf_text)
if args.extended_fields:
    b.attach_kretprobe(event="fragmentation_index",
                       fn_name="trace_fragmentation_index_return")

stack_traces = b.get_table("stack_traces")
initial_ts = 0

def zone_idx_to_str(idx):
    # from include/linux/mmzone.h
    # NOTICE: consider only x86_64 servers
    zone_type = {
            'x86_64':
                    {
                        0: "ZONE_DMA",
                        1: "ZONE_DMA32",
                        2: "ZONE_NORMAL"
                    },
            # Zones in Power server only
            'ppc64le':
                    {
                        0: "ZONE_NORMAL",
                        1: "ZONE_MOVABLE"
                    }
    }

    if idx in zone_type[platform.machine()]:
        return zone_type[platform.machine()][idx]
    else:
        return str(idx)

def compact_result_to_str(status):
    # from include/trace/evnets/mmflags.h
    # from include/linux/compaction.h
    compact_status = {
        # COMPACT_NOT_SUITABLE_ZONE: For more detailed tracepoint
        # output - internal to compaction
        0: "not_suitable_zone",
        # COMPACT_SKIPPED: compaction didn't start as it was not
        # possible or direct reclaim was more suitable
        1: "skipped",
        # COMPACT_DEFERRED: compaction didn't start as it was
        # deferred due to past failures
        2: "deferred",
        # COMPACT_NOT_SUITABLE_PAGE: For more detailed tracepoint
        # output - internal to compaction
        3: "no_suitable_page",
        # COMPACT_CONTINUE: compaction should continue to another pageblock
        4: "continue",
        # COMPACT_COMPLETE: The full zone was compacted scanned but wasn't
        # successful to compact suitable pages.
        5: "complete",
        # COMPACT_PARTIAL_SKIPPED: direct compaction has scanned part of the
        # zone but wasn't successful to compact suitable pages.
        6: "partial_skipped",
        # COMPACT_CONTENDED: compaction terminated prematurely due to lock
        # contentions
        7: "contended",
        # COMPACT_SUCCESS: direct compaction terminated after concluding
        # that the allocation should now succeed
        8: "success",
    }

    if status in compact_status:
        return compact_status[status]
    else:
        return str(status)

# header
if args.timestamp:
    print("%-14s" % ("TIME(s)"), end=" ")
print("%-14s %-6s %-4s %-12s %-5s %-7s" %
      ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"), end=" ")
if args.extended_fields:
    print("%-8s %-8s %-8s %-8s %-8s" %
          ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"), end=" ")
print("%9s %16s" % ("LAT(ms)", "STATUS"))

# process event
def print_event(cpu, data, size):
    event = b["events"].event(data)

    global initial_ts

    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
            event.comm.decode("utf-8", "replace"),
            event.pid,
            event.nid,
            zone_idx_to_str(event.idx),
            event.order,
            "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
            (float(event.fragindex) / 1000),
            event.min, event.low, event.high, event.free
            ), end=" ")
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        for addr in stack_traces.walk(event.stack_id):
            sym = b.ksym(addr, show_offset=True)
            print("\t%s" % sym)
        print("")

    sys.stdout.flush()

# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while not args.duration or datetime.now() - start_time < args.duration:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()

Zerion Mini Shell 1.0