From 642ffe8361670a80417e077292b4d3582b4183bc Mon Sep 17 00:00:00 2001 From: Gabriel Ferreira Date: Fri, 14 Oct 2022 12:59:30 -0300 Subject: [PATCH] build, doc: add options to run with Heaptrack, Memray or Perf profilers --- doc/manual/source/profiling.rst | 134 +++++++++++++++++++++++++++++++- ns3 | 52 ++++++++++--- 2 files changed, 172 insertions(+), 14 deletions(-) diff --git a/doc/manual/source/profiling.rst b/doc/manual/source/profiling.rst index f6700a766..d27d3df32 100644 --- a/doc/manual/source/profiling.rst +++ b/doc/manual/source/profiling.rst @@ -225,13 +225,13 @@ If you prefer to use the ``ns3`` wrapper, try: .. sourcecode:: console - ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "heaptrack %s" --no-build + ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --heaptrack --no-build In both cases, heaptrack will print to the terminal the output file: .. sourcecode:: console - ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "heaptrack %s" --no-build + ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --heaptrack --no-build heaptrack output will be written to "~ns-3-dev/heaptrack.ns3-dev-wifi-he-network.210305.zst" starting application, this might take some time... MCS value Channel width GI Throughput @@ -388,6 +388,79 @@ were removed, which translates to a 20% reduction. This resulted in a 1.07x spee test suite with Valgrind (``./test.py -d -g``) and 1.02x speedup without it. +Memray +++++++ + +.. 
_Memray : https://bloomberg.github.io/memray/ + +`Memray`_ is a utility made by Bloomberg to trace memory allocations of Python programs, +including native code called by them. Along with stack traces, developers can track down +possible memory leaks and unnecessary allocations. + +Note: Memray is ineffective for profiling the ns-3 Python bindings since Cppyy hides away +the calls to the ns-3 module libraries. However, it is still useful for Python scripts +in general, for example ones used to parse and consolidate simulation results. + +The ``ns3`` script includes a run option to launch Python programs with Memray. +Memray can produce different types of reports, such as a flamegraph in HTML, or +text reports (``summary`` and ``stats``). + +.. sourcecode:: console + + ~/ns-3-dev/$ ./ns3 run sample-rng-plot.py --memray + Writing profile results into memray.output + Memray WARNING: Correcting symbol for aligned_alloc from 0x7fd97023c890 to 0x7fd97102fce0 + [memray] Successfully generated profile results. + + You can now generate reports from the stored allocation records. 
+ Some example commands to generate reports: + + /usr/bin/python3 -m memray flamegraph memray.output + ~/ns-3-dev$ /usr/bin/python3 -m memray stats memray.output + Total allocations: + 5364235 + + Total memory allocated: + 10.748GB + + Histogram of allocation size: + min: 0.000B + ---------------------------------------------- + < 8.000B : 264149 ||| + < 78.000B : 2051906 ||||||||||||||||||||||| + < 699.000B : 2270941 ||||||||||||||||||||||||| + < 6.064KB : 608993 ||||||| + < 53.836KB : 165307 || + < 477.912KB: 2220 | + < 4.143MB : 511 | + < 36.779MB : 188 | + < 326.492MB: 19 | + <=2.830GB : 1 | + ---------------------------------------------- + max: 2.830GB + + Allocator type distribution: + MALLOC: 4647765 + CALLOC: 435525 + REALLOC: 277736 + POSIX_MEMALIGN: 2686 + MMAP: 523 + + Top 5 largest allocating locations (by size): + - include:/usr/local/lib/python3.10/dist-packages/cppyy/__init__.py:243 -> 8.814GB + - -> 746.999MB + - show:~/.local/lib/python3.10/site-packages/matplotlib/backends/backend_gtk4.py:340 -> 263.338MB + - load_library:/usr/local/lib/python3.10/dist-packages/cppyy/__init__.py:235 -> 245.684MB + - __init__:/usr/lib/python3.10/ctypes/__init__.py:374 -> 225.797MB + + Top 5 largest allocating locations (by number of allocations): + - include:/usr/local/lib/python3.10/dist-packages/cppyy/__init__.py:243 -> 2246145 + - show:~/.local/lib/python3.10/site-packages/matplotlib/backends/backend_gtk4.py:340 -> 1264614 + - -> 1098543 + - __init__:~/.local/lib/python3.10/site-packages/matplotlib/backends/backend_gtk4.py:61 -> 89466 + - run:/usr/lib/python3/dist-packages/gi/overrides/Gio.py:42 -> 79582 + + Performance Profilers ********************* @@ -458,6 +531,63 @@ to the ``perf.data`` output file. 
~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "perf record -o ./perf.data --call-graph dwarf --event cycles,cache-misses,branch-misses --sample-cpu %s" --no-build +For ease of use, ``ns3`` also provides the ``--perf`` run option, which +includes the recommended settings. + +.. sourcecode:: console + + ~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --perf --no-build + +When running for the first time, you may receive the following error: + +.. sourcecode:: console + + ~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --perf --no-build + Error: + Access to performance monitoring and observability operations is limited. + Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open + access to performance monitoring and observability operations for processes + without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. + More information can be found at 'Perf events and tool security' document: + https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html + perf_event_paranoid setting is 1: + -1: Allow use of (almost) all events by all users + Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK + >= 0: Disallow raw and ftrace function tracepoint access + >= 1: Disallow CPU event access + >= 2: Disallow kernel profiling + To make the adjusted perf_event_paranoid setting permanent preserve it + in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>) + Command 'build/examples/wireless/ns3-dev-wifi-he-network-default record --call-graph dwarf -a -e cache-misses,branch-misses,cpu-cycles,instructions,context-switches build/examples/wireless/ns3-dev-wifi-he-network-default -n=100' returned non-zero exit status 255. 
+ +This error is related to lacking permissions to access performance events from the kernel and CPU. +As said in the error, permissions can be granted for the current session +by changing the ``perf_event_paranoid`` setting with ``echo 0 > /proc/sys/kernel/perf_event_paranoid``. +This change can be made permanent by changing the setting in ``/etc/sysctl.conf``, but +this is not recommended. Administrative permissions (``sudo su``) are required in both cases. + +After the program finishes, it will print recording statistics. + +.. sourcecode:: console + + MCS value Channel width GI Throughput + 0 20 MHz 3200 ns 6.01067 Mbit/s + 0 20 MHz 1600 ns 5.936 Mbit/s + ... + 11 160 MHz 1600 ns 493.397 Mbit/s + 11 160 MHz 800 ns 534.016 Mbit/s + [ perf record: Woken up 9529 times to write data ] + Warning: + Processed 517638 events and lost 94 chunks! + + Check IO/CPU overload! + + Warning: + 1 out of order events recorded. + [ perf record: Captured and wrote 2898,307 MB perf.data (436509 samples) ] + + +Results saved in ``perf.data`` can be reviewed with the ``perf report`` command. `Hotspot`_ is a GUI for Perf, that makes performance profiling more enjoyable and productive. It can parse the ``perf.data`` and show in diff --git a/ns3 b/ns3 index ffd6b0fdc..2d9d5150b 100755 --- a/ns3 +++ b/ns3 @@ -30,7 +30,7 @@ def exit_handler(dry_run): return if print_buffer == "": return - print_buffer = print_buffer.replace('\\','/').replace('//','/').replace('/', os.sep) + print_buffer = print_buffer.replace('\\', '/').replace('//', '/').replace('/', os.sep) if dry_run: print("The following commands would be executed:") elif run_verbose: @@ -245,6 +245,15 @@ def parse_args(argv): parser_run.add_argument('-g', '--valgrind', help='Change the default command template to run programs with valgrind', action="store_true", default=None) + parser_run.add_argument('--memray', + help='Use Memray memory profiler for Python scripts. 
Output will be saved to memray.output', + action="store_true", default=None) + parser_run.add_argument('--heaptrack', + help='Use Heaptrack memory profiler for C++', + action="store_true", default=None) + parser_run.add_argument('--perf', + help='Use Linux\'s perf to profile a program', + action="store_true", default=None) parser_run.add_argument('--vis', '--visualize', help='Modify --run arguments to enable the visualizer', action="store_true", dest="visualize", default=None) @@ -706,17 +715,16 @@ def get_program_shortcuts(build_profile, ns3_version): # Add an additional shortcut with .exe suffix when running on Windows if sys.platform == "win32": - ns3_program_map[shortcut_path.replace("\\","/")] = [program] - ns3_program_map[shortcut_path+".exe"] = [program] - ns3_program_map[shortcut_path.replace("\\","/")+".exe"] = [program] - + ns3_program_map[shortcut_path.replace("\\", "/")] = [program] + ns3_program_map[shortcut_path + ".exe"] = [program] + ns3_program_map[shortcut_path.replace("\\", "/") + ".exe"] = [program] if source_shortcut: cc_shortcut_path = shortcut_path + ".cc" ns3_program_map[cc_shortcut_path] = [program] if sys.platform == "win32": ns3_program_map[cc_shortcut_path] = [program] - ns3_program_map[cc_shortcut_path.replace("\\","/")] = [program] + ns3_program_map[cc_shortcut_path.replace("\\", "/")] = [program] # Store longest shortcut path for collisions if cc_shortcut_path not in longest_shortcut_map: @@ -1008,6 +1016,7 @@ def check_program_installed(program_name: str) -> str: exit(-1) return program_path + def check_module_installed(module_name: str): import importlib try: @@ -1016,6 +1025,7 @@ def check_module_installed(module_name: str): print("Python module '{module}' was not found".format(module=module_name)) exit(-1) + def run_step(args, target_to_run, target_args): libdir = "%s/lib" % out_dir @@ -1046,13 +1056,23 @@ def run_step(args, target_to_run, target_args): target_args = [target_to_run] + target_args target_to_run = "python3" + # 
running with memray? + if args.memray: + check_module_installed("memray") + target_args = ["-m", "memray", "run", "-o", "memray.output", "--native"] + target_args + # running from ns-3-dev (ns3_path) or cwd if args.cwd: working_dir = args.cwd + # running with heaptrack? + if args.heaptrack: + debugging_software.append(check_program_installed("heaptrack")) + # running valgrind? if args.valgrind: - debugging_software.extend([check_program_installed("valgrind"), "--leak-check=full", "--show-leak-kinds=all"]) + debugging_software.extend( + [check_program_installed("valgrind"), "--leak-check=full", "--show-leak-kinds=all"]) # running gdb? if args.gdb: @@ -1065,6 +1085,14 @@ def run_step(args, target_to_run, target_args): if args.lldb: debugging_software.extend([check_program_installed("lldb"), "--"]) + # running with perf? + if args.perf: + debugging_software.extend([ + check_program_installed("perf"), + "record", "--call-graph", "dwarf", "-a", "-e", + "cache-misses,branch-misses,cpu-cycles,stalled-cycles-frontend,stalled-cycles-backend,context-switches" + ]) + # running with the visualizer? 
if args.visualize: target_args.append("--SimulatorImplementationType=ns3::VisualSimulatorImpl") @@ -1099,10 +1127,10 @@ def run_step(args, target_to_run, target_args): try: subprocess.run(program_arguments, env=proc_env, cwd=working_dir, shell=use_shell, check=True) except subprocess.CalledProcessError as e: - # Replace full path to binary to relative path - e.cmd[0] = os.path.relpath(target_to_run, ns3_path) # Replace list of arguments with a single string e.cmd = " ".join(e.cmd) + # Replace full path to binary to relative path + e.cmd = e.cmd.replace(os.path.abspath(target_to_run), os.path.relpath(target_to_run, ns3_path)) # Print error message and forward the return code print(e) exit(e.returncode) @@ -1130,7 +1158,7 @@ def non_ambiguous_program_target_list(programs: dict) -> list: def print_targets_list(ns3_modules: list, ns3_programs: dict) -> None: - def list_to_table(l: list) -> str: + def list_to_table(targets_list: list) -> str: # Set column width and check how much is space is left at the end columnwidth = 30 try: @@ -1140,10 +1168,10 @@ def print_targets_list(ns3_modules: list, ns3_programs: dict) -> None: dead_space = terminal_width % columnwidth # Filter the targets with names longer than the column width - large_items = list(filter(lambda x: len(x) >= columnwidth, l)) + large_items = list(filter(lambda x: len(x) >= columnwidth, targets_list)) # Then filter the targets with names shorter than the column width - small_items = sorted(list(set(l) - set(large_items))) + small_items = sorted(list(set(targets_list) - set(large_items))) prev_new_line = 0 output = "\n"