From 642ffe8361670a80417e077292b4d3582b4183bc Mon Sep 17 00:00:00 2001
From: Gabriel Ferreira <gabrielcarvfer@gmail.com>
Date: Fri, 14 Oct 2022 12:59:30 -0300
Subject: [PATCH] build, doc: add options to run with Heaptrack, Memray or Perf
 profilers

---
 doc/manual/source/profiling.rst | 134 +++++++++++++++++++++++++++++++-
 ns3                             |  52 ++++++++++---
 2 files changed, 172 insertions(+), 14 deletions(-)

diff --git a/doc/manual/source/profiling.rst b/doc/manual/source/profiling.rst
index f6700a766..d27d3df32 100644
--- a/doc/manual/source/profiling.rst
+++ b/doc/manual/source/profiling.rst
@@ -225,13 +225,13 @@ If you prefer to use the ``ns3`` wrapper, try:
 
 .. sourcecode:: console
 
-   ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "heaptrack %s" --no-build
+   ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --heaptrack --no-build
 
 In both cases, heaptrack will print to the terminal the output file:
 
 .. sourcecode:: console
 
-    ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "heaptrack %s" --no-build
+    ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --heaptrack --no-build
     heaptrack output will be written to "~ns-3-dev/heaptrack.ns3-dev-wifi-he-network.210305.zst"
     starting application, this might take some time...
     MCS value               Channel width           GI                      Throughput
@@ -388,6 +388,79 @@ were removed, which translates to a 20% reduction. This resulted in a 1.07x spee
 test suite with Valgrind (``./test.py -d -g``) and 1.02x speedup without it.
 
 
+Memray
+++++++
+
+.. _Memray : https://bloomberg.github.io/memray/
+
+`Memray`_ is a utility made by Bloomberg to trace memory allocations of Python programs,
+including native code called by them. Along with stack traces, developers can trace down
+possible memory leaks and unnecessary allocations.
+
+Note: Memray is ineffective for profiling the ns-3 Python bindings since Cppyy hides away
+the calls to the ns-3 module libraries. However, it is still useful for Python scripts
+in general, for example ones used to parse and consolidate simulation results.
+
+The ``ns3`` script includes a run option to launch Python programs with Memray.
+Memray can produce different types of reports, such as a flamegraph in HTML, or
+text reports (``summary`` and ``stats``).
+
+.. sourcecode:: console
+
+    ~/ns-3-dev/$ ./ns3 run sample-rng-plot.py --memray
+    Writing profile results into memray.output
+    Memray WARNING: Correcting symbol for aligned_alloc from 0x7fd97023c890 to 0x7fd97102fce0
+    [memray] Successfully generated profile results.
+
+    You can now generate reports from the stored allocation records.
+    Some example commands to generate reports:
+
+    /usr/bin/python3 -m memray flamegraph memray.output
+    ~/ns-3-dev$ /usr/bin/python3 -m memray stats memray.output
+       Total allocations:
+             5364235
+
+       Total memory allocated:
+             10.748GB
+
+       Histogram of allocation size:
+             min: 0.000B
+             ----------------------------------------------
+             < 8.000B   :  264149 |||
+             < 78.000B  : 2051906 |||||||||||||||||||||||
+             < 699.000B : 2270941 |||||||||||||||||||||||||
+             < 6.064KB  :  608993 |||||||
+             < 53.836KB :  165307 ||
+             < 477.912KB:    2220 |
+             < 4.143MB  :     511 |
+             < 36.779MB :     188 |
+             < 326.492MB:      19 |
+             <=2.830GB  :       1 |
+             ----------------------------------------------
+             max: 2.830GB
+
+       Allocator type distribution:
+              MALLOC: 4647765
+              CALLOC: 435525
+              REALLOC: 277736
+              POSIX_MEMALIGN: 2686
+              MMAP: 523
+
+       Top 5 largest allocating locations (by size):
+             - include:/usr/local/lib/python3.10/dist-packages/cppyy/__init__.py:243 -> 8.814GB
+             - <stack trace unavailable> -> 746.999MB
+             - show:~/.local/lib/python3.10/site-packages/matplotlib/backends/backend_gtk4.py:340 -> 263.338MB
+             - load_library:/usr/local/lib/python3.10/dist-packages/cppyy/__init__.py:235 -> 245.684MB
+             - __init__:/usr/lib/python3.10/ctypes/__init__.py:374 -> 225.797MB
+
+       Top 5 largest allocating locations (by number of allocations):
+             - include:/usr/local/lib/python3.10/dist-packages/cppyy/__init__.py:243 -> 2246145
+             - show:~/.local/lib/python3.10/site-packages/matplotlib/backends/backend_gtk4.py:340 -> 1264614
+             - <stack trace unavailable> -> 1098543
+             - __init__:~/.local/lib/python3.10/site-packages/matplotlib/backends/backend_gtk4.py:61 -> 89466
+             - run:/usr/lib/python3/dist-packages/gi/overrides/Gio.py:42 -> 79582
+
+
 Performance Profilers
 *********************
 
@@ -458,6 +531,63 @@ to the ``perf.data`` output file.
 
     ~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "perf record -o ./perf.data --call-graph dwarf --event cycles,cache-misses,branch-misses --sample-cpu %s" --no-build
 
+For ease of use, ``ns3`` also provides the ``--perf`` run option, which
+includes the recommended settings.
+
+.. sourcecode:: console
+
+    ~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --perf --no-build
+
+When running for the first time, you may receive the following error:
+
+.. sourcecode:: console
+
+    ~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --perf --no-build
+    Error:
+    Access to performance monitoring and observability operations is limited.
+    Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open
+    access to performance monitoring and observability operations for processes
+    without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.
+    More information can be found at 'Perf events and tool security' document:
+    https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html
+    perf_event_paranoid setting is 1:
+      -1: Allow use of (almost) all events by all users
+          Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+    >= 0: Disallow raw and ftrace function tracepoint access
+    >= 1: Disallow CPU event access
+    >= 2: Disallow kernel profiling
+    To make the adjusted perf_event_paranoid setting permanent preserve it
+    in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
+    Command 'build/examples/wireless/ns3-dev-wifi-he-network-default record --call-graph dwarf -a -e cache-misses,branch-misses,cpu-cycles,instructions,context-switches build/examples/wireless/ns3-dev-wifi-he-network-default -n=100' returned non-zero exit status 255.
+
+This error is caused by a lack of permissions to access performance events from the kernel and CPU.
+As stated in the error message, permissions can be granted for the current session
+by changing the ``perf_event_paranoid`` setting with ``echo 0 > /proc/sys/kernel/perf_event_paranoid``.
+This change can be made permanent by changing the setting in ``/etc/sysctl.conf``, but
+this is not recommended. Administrative permissions (``sudo su``) are required in both cases.
+
+After the program finishes, it will print recording statistics.
+
+.. sourcecode:: console
+
+    MCS value               Channel width           GI                      Throughput
+    0                       20 MHz                  3200 ns                 6.01067 Mbit/s
+    0                       20 MHz                  1600 ns                 5.936 Mbit/s
+    ...
+    11                      160 MHz                 1600 ns                 493.397 Mbit/s
+    11                      160 MHz                 800 ns                  534.016 Mbit/s
+    [ perf record: Woken up 9529 times to write data ]
+    Warning:
+    Processed 517638 events and lost 94 chunks!
+
+    Check IO/CPU overload!
+
+    Warning:
+    1 out of order events recorded.
+    [ perf record: Captured and wrote 2898,307 MB perf.data (436509 samples) ]
+
+
+Results saved in ``perf.data`` can be reviewed with the ``perf report`` command.
 
 `Hotspot`_ is a GUI for Perf, that makes performance profiling more
 enjoyable and productive. It can parse the ``perf.data`` and show in
diff --git a/ns3 b/ns3
index ffd6b0fdc..2d9d5150b 100755
--- a/ns3
+++ b/ns3
@@ -30,7 +30,7 @@ def exit_handler(dry_run):
         return
     if print_buffer == "":
         return
-    print_buffer = print_buffer.replace('\\','/').replace('//','/').replace('/', os.sep)
+    print_buffer = print_buffer.replace('\\', '/').replace('//', '/').replace('/', os.sep)
     if dry_run:
         print("The following commands would be executed:")
     elif run_verbose:
@@ -245,6 +245,15 @@ def parse_args(argv):
     parser_run.add_argument('-g', '--valgrind',
                             help='Change the default command template to run programs with valgrind',
                             action="store_true", default=None)
+    parser_run.add_argument('--memray',
+                            help='Use Memray memory profiler for Python scripts. Output will be saved to memray.output',
+                            action="store_true", default=None)
+    parser_run.add_argument('--heaptrack',
+                            help='Use Heaptrack memory profiler for C++',
+                            action="store_true", default=None)
+    parser_run.add_argument('--perf',
+                            help='Use Linux\'s perf to profile a program',
+                            action="store_true", default=None)
     parser_run.add_argument('--vis', '--visualize',
                             help='Modify --run arguments to enable the visualizer',
                             action="store_true", dest="visualize", default=None)
@@ -706,17 +715,16 @@ def get_program_shortcuts(build_profile, ns3_version):
 
             # Add an additional shortcut with .exe suffix when running on Windows
             if sys.platform == "win32":
-                ns3_program_map[shortcut_path.replace("\\","/")] = [program]
-                ns3_program_map[shortcut_path+".exe"] = [program]
-                ns3_program_map[shortcut_path.replace("\\","/")+".exe"] = [program]
-
+                ns3_program_map[shortcut_path.replace("\\", "/")] = [program]
+                ns3_program_map[shortcut_path + ".exe"] = [program]
+                ns3_program_map[shortcut_path.replace("\\", "/") + ".exe"] = [program]
 
             if source_shortcut:
                 cc_shortcut_path = shortcut_path + ".cc"
                 ns3_program_map[cc_shortcut_path] = [program]
                 if sys.platform == "win32":
                     ns3_program_map[cc_shortcut_path] = [program]
-                    ns3_program_map[cc_shortcut_path.replace("\\","/")] = [program]
+                    ns3_program_map[cc_shortcut_path.replace("\\", "/")] = [program]
 
                 # Store longest shortcut path for collisions
                 if cc_shortcut_path not in longest_shortcut_map:
@@ -1008,6 +1016,7 @@ def check_program_installed(program_name: str) -> str:
         exit(-1)
     return program_path
 
+
 def check_module_installed(module_name: str):
     import importlib
     try:
@@ -1016,6 +1025,7 @@ def check_module_installed(module_name: str):
         print("Python module '{module}' was not found".format(module=module_name))
         exit(-1)
 
+
 def run_step(args, target_to_run, target_args):
     libdir = "%s/lib" % out_dir
 
@@ -1046,13 +1056,23 @@ def run_step(args, target_to_run, target_args):
             target_args = [target_to_run] + target_args
             target_to_run = "python3"
 
+            # running with memray?
+            if args.memray:
+                check_module_installed("memray")
+                target_args = ["-m", "memray", "run", "-o", "memray.output", "--native"] + target_args
+
         # running from ns-3-dev (ns3_path) or cwd
         if args.cwd:
             working_dir = args.cwd
 
+        # running with heaptrack?
+        if args.heaptrack:
+            debugging_software.append(check_program_installed("heaptrack"))
+
         # running valgrind?
         if args.valgrind:
-            debugging_software.extend([check_program_installed("valgrind"), "--leak-check=full", "--show-leak-kinds=all"])
+            debugging_software.extend(
+                [check_program_installed("valgrind"), "--leak-check=full", "--show-leak-kinds=all"])
 
         # running gdb?
         if args.gdb:
@@ -1065,6 +1085,14 @@ def run_step(args, target_to_run, target_args):
         if args.lldb:
             debugging_software.extend([check_program_installed("lldb"), "--"])
 
+        # running with perf?
+        if args.perf:
+            debugging_software.extend([
+                check_program_installed("perf"),
+                "record", "--call-graph", "dwarf", "-a", "-e",
+                "cache-misses,branch-misses,cpu-cycles,stalled-cycles-frontend,stalled-cycles-backend,context-switches"
+            ])
+
         # running with the visualizer?
         if args.visualize:
             target_args.append("--SimulatorImplementationType=ns3::VisualSimulatorImpl")
@@ -1099,10 +1127,10 @@ def run_step(args, target_to_run, target_args):
         try:
             subprocess.run(program_arguments, env=proc_env, cwd=working_dir, shell=use_shell, check=True)
         except subprocess.CalledProcessError as e:
-            # Replace full path to binary to relative path
-            e.cmd[0] = os.path.relpath(target_to_run, ns3_path)
             # Replace list of arguments with a single string
             e.cmd = " ".join(e.cmd)
+            # Replace the full path to the binary with its relative path
+            e.cmd = e.cmd.replace(os.path.abspath(target_to_run), os.path.relpath(target_to_run, ns3_path))
             # Print error message and forward the return code
             print(e)
             exit(e.returncode)
@@ -1130,7 +1158,7 @@ def non_ambiguous_program_target_list(programs: dict) -> list:
 
 
 def print_targets_list(ns3_modules: list, ns3_programs: dict) -> None:
-    def list_to_table(l: list) -> str:
+    def list_to_table(targets_list: list) -> str:
         # Set column width and check how much is space is left at the end
         columnwidth = 30
         try:
@@ -1140,10 +1168,10 @@ def print_targets_list(ns3_modules: list, ns3_programs: dict) -> None:
         dead_space = terminal_width % columnwidth
 
         # Filter the targets with names longer than the column width
-        large_items = list(filter(lambda x: len(x) >= columnwidth, l))
+        large_items = list(filter(lambda x: len(x) >= columnwidth, targets_list))
 
         # Then filter the targets with names shorter than the column width
-        small_items = sorted(list(set(l) - set(large_items)))
+        small_items = sorted(list(set(targets_list) - set(large_items)))
 
         prev_new_line = 0
         output = "\n"