diff --git a/doc/manual/Makefile b/doc/manual/Makefile index fcd7611a5..9adf44ff7 100644 --- a/doc/manual/Makefile +++ b/doc/manual/Makefile @@ -17,7 +17,6 @@ SOURCES = \ source/attributes.rst \ source/callbacks.rst \ source/develop.rst \ - source/working-with-git.rst \ source/documentation.rst \ source/enable-modules.rst \ source/enable-tests.rst \ @@ -33,6 +32,7 @@ SOURCES = \ source/object-model.rst \ source/object-names.rst \ source/organization.rst \ + source/profiling.rst \ source/python.rst \ source/random-variables.rst \ source/realtime.rst \ @@ -46,6 +46,8 @@ SOURCES = \ source/troubleshoot.rst \ source/utilities.rst \ source/working-with-cmake.rst \ + source/working-with-git.rst \ + source/working-with-gitlab-ci-local.rst \ ${SRC}/stats/doc/data-collection.rst \ ${SRC}/stats/doc/data-collection-overview.rst \ ${SRC}/stats/doc/statistics.rst \ @@ -64,12 +66,30 @@ SOURCES = \ # 3) both a .pdf and .png file SOURCEFIGS = \ - figures/software-organization.dia \ + figures/software-organization.dia \ figures/plot-2d.png \ figures/plot-2d-with-error-bars.png \ figures/plot-3d.png \ figures/gtk-config-lena-dual-stripe-device-view.png \ figures/gtk-config-lena-dual-stripe-eNB-tx-power.png \ + figures/heaptrack.png \ + figures/hotspot-cycles.png \ + figures/hotspot-setup.png \ + figures/hotspot-top-down.png \ + figures/uprof-collect-callstack.png \ + figures/uprof-collect-callstack.png \ + figures/uprof-profile-application.png \ + figures/uprof-select-events.png \ + figures/uprof-start.png \ + figures/uprof-stats.png \ + figures/uprof-strace-lte.png \ + figures/vtune-configure.png \ + figures/vtune-landing.png \ + figures/vtune-perf-snapshot.png \ + figures/vtune-select-uarch-profiling.png \ + figures/vtune-uarch-core-stats.png \ + figures/vtune-uarch-profiling-summary.png \ + figures/vtune-uarch-wifi-stats.png \ ${SRC}/stats/doc/Stat-framework-arch.png \ ${SRC}/stats/doc/Wifi-default.png \ ${SRC}/stats/doc/dcf-overview.dia \ diff --git a/doc/manual/figures/heaptrack.png b/doc/manual/figures/heaptrack.png new file mode 100644 index 000000000..5c17e44bd Binary files /dev/null and b/doc/manual/figures/heaptrack.png differ diff --git a/doc/manual/figures/hotspot-cycles.png b/doc/manual/figures/hotspot-cycles.png new file mode 100644 index 000000000..dbe4326f4 Binary files /dev/null and b/doc/manual/figures/hotspot-cycles.png differ diff --git a/doc/manual/figures/hotspot-setup.png b/doc/manual/figures/hotspot-setup.png new file mode 100644 index 000000000..c53b69343 Binary files /dev/null and b/doc/manual/figures/hotspot-setup.png differ diff --git a/doc/manual/figures/hotspot-top-down.png b/doc/manual/figures/hotspot-top-down.png new file mode 100644 index 000000000..2ba551b6c Binary files /dev/null and b/doc/manual/figures/hotspot-top-down.png differ diff --git a/doc/manual/figures/uprof-collect-callstack.png b/doc/manual/figures/uprof-collect-callstack.png new file mode 100644 index 000000000..6ab49806c Binary files /dev/null and b/doc/manual/figures/uprof-collect-callstack.png differ diff --git a/doc/manual/figures/uprof-profile-application.png b/doc/manual/figures/uprof-profile-application.png new file mode 100644 index 000000000..48c583f2d Binary files /dev/null and b/doc/manual/figures/uprof-profile-application.png differ diff --git a/doc/manual/figures/uprof-select-events.png b/doc/manual/figures/uprof-select-events.png new file mode 100644 index 000000000..97b1e43f6 Binary files /dev/null and b/doc/manual/figures/uprof-select-events.png differ diff --git a/doc/manual/figures/uprof-start.png b/doc/manual/figures/uprof-start.png new file mode 100644 index 000000000..d51ce30c9 Binary files /dev/null and b/doc/manual/figures/uprof-start.png differ diff --git a/doc/manual/figures/uprof-stats.png b/doc/manual/figures/uprof-stats.png new file mode 100644 index 000000000..c7072bd7d Binary files /dev/null and b/doc/manual/figures/uprof-stats.png differ diff --git a/doc/manual/figures/uprof-strace-lte.png b/doc/manual/figures/uprof-strace-lte.png new file mode 100644 index 000000000..e93e1e4bc Binary files /dev/null and b/doc/manual/figures/uprof-strace-lte.png differ diff --git a/doc/manual/figures/vtune-configure.png b/doc/manual/figures/vtune-configure.png new file mode 100644 index 000000000..9a50d7f6c Binary files /dev/null and b/doc/manual/figures/vtune-configure.png differ diff --git a/doc/manual/figures/vtune-landing.png b/doc/manual/figures/vtune-landing.png new file mode 100644 index 000000000..93b5555ec Binary files /dev/null and b/doc/manual/figures/vtune-landing.png differ diff --git a/doc/manual/figures/vtune-perf-snapshot.png b/doc/manual/figures/vtune-perf-snapshot.png new file mode 100644 index 000000000..f57b69777 Binary files /dev/null and b/doc/manual/figures/vtune-perf-snapshot.png differ diff --git a/doc/manual/figures/vtune-select-uarch-profiling.png b/doc/manual/figures/vtune-select-uarch-profiling.png new file mode 100644 index 000000000..364a80ca9 Binary files /dev/null and b/doc/manual/figures/vtune-select-uarch-profiling.png differ diff --git a/doc/manual/figures/vtune-uarch-core-stats.png b/doc/manual/figures/vtune-uarch-core-stats.png new file mode 100644 index 000000000..bf31192fd Binary files /dev/null and b/doc/manual/figures/vtune-uarch-core-stats.png differ diff --git a/doc/manual/figures/vtune-uarch-profiling-summary.png b/doc/manual/figures/vtune-uarch-profiling-summary.png new file mode 100644 index 000000000..74f4c2634 Binary files /dev/null and b/doc/manual/figures/vtune-uarch-profiling-summary.png differ diff --git a/doc/manual/figures/vtune-uarch-wifi-stats.png b/doc/manual/figures/vtune-uarch-wifi-stats.png new file mode 100644 index 000000000..8d7bb1e30 Binary files /dev/null and b/doc/manual/figures/vtune-uarch-wifi-stats.png differ diff --git a/doc/manual/source/develop.rst b/doc/manual/source/develop.rst index 02e1b8899..81e9faf88 100644 --- a/doc/manual/source/develop.rst +++ b/doc/manual/source/develop.rst @@ -3,7 +3,7 @@ Developer Tools =============== -This chapter describes the development ecosystem generaly used to create new modules. +This chapter describes the development ecosystem generally used to create new modules. .. toctree:: :maxdepth: 2 @@ -15,4 +15,6 @@ This chapter describes the development ecosystem generaly used to create new mod new-models new-modules documentation + profiling + working-with-gitlab-ci-local diff --git a/doc/manual/source/profiling.rst b/doc/manual/source/profiling.rst new file mode 100644 index 000000000..c72bf4804 --- /dev/null +++ b/doc/manual/source/profiling.rst @@ -0,0 +1,1036 @@ +.. include:: replace.txt +.. highlight:: bash + +Profiling +--------- + +Memory profiling is essential to identify issues that +may cause memory corruption, which may lead to all sorts of +side-effects, such as crashing after many hours of simulation and +producing wrong results that invalidate the entire simulation. + +It also can help tracking sources of excessive memory allocations, +the size of these allocations and memory usage during simulation. +These can affect simulation performance, or limit the complexity +and the number of concurrent simulations. + +Performance profiling on the other hand is essential for +high-performance applications, as it allows for the identification +of bottlenecks and their mitigation. + +Another type of profiling is related to system calls. They +can be used to debug issues and identify hotspots that +may cause performance issues in specific conditions. Excessive +calls results in more context switches, which interrupt the +simulations, ultimately slowing them down. + +Other than profiling the simulations, which can highlight bottlenecks +in the simulator, we can also profile the compilation process. +This allows us to identify and fix bottlenecks, which speed up +build times. + + +Memory Profilers +**************** + +.. _Sanitizers : https://github.com/google/sanitizers/wiki +.. _Heaptrack : https://apps.kde.org/heaptrack/ +.. _Valgrind: https://valgrind.org/ +.. _MacOS's leaks: https://developer.apple.com/library/archive/documentation/Performance/Conceptual/ManagingMemory/Articles/FindingLeaks.html +.. _Bytehound : https://github.com/koute/bytehound +.. _gperftools : https://github.com/gperftools/gperftools + +Memory profilers are tools that help identifying memory related +issues. + +There are two well known tools for finding bugs such as uninitialized memory usage, +out-of-bound accesses, dereferencing null pointers and other memory-related bugs: + +* `Valgrind`_ + + * Pros: very rich tooling, no need to recompile programs to profile the program. + * Cons: very slow and limited to Linux and MacOS. +* `Sanitizers`_ + + * Pros: sanitizers are distributed along with compilers, such as GCC, Clang and MSVC. + They are widely available, cross platform and faster than Valgrind. + * Cons: false positives, high memory usage, memory sanitizer is incompatible + with other sanitizers (e.g. address sanitizer), requiring two instrumented + compilations and two test runs. The memory sanitizer requires Clang. + +There are also tools to count memory allocations, track memory usage and memory leaks, +such as: `Heaptrack`_, `MacOS's leaks`_, `Bytehound`_ and `gperftools`_. + +An overview on how to use `Valgrind`_, `Sanitizers`_ and +`Heaptrack`_ is provided in the following sections. + +Valgrind +++++++++ + +.. _manual : https://valgrind.org/docs/manual/mc-manual.html + +`Valgrind`_ is suite of profiling tools, being the main tool called Memcheck. +To check for memory errors including leaks, one can call valgrind directly: + +.. sourcecode:: console + + valgrind --leak-check=yes ./relative/path/to/program argument1 argument2 + +Or can use the ``ns3`` script: + +.. sourcecode:: console + + ./ns3 run "program argument1 argument2" --valgrind + +Additional Valgrind options are listed on its `manual`_. + +Sanitizers +++++++++++ + +`Sanitizers`_ are a suite of libraries made by Google and part of the LLVM project, +used to profile programs at runtime and find issues related to undefined behavior, +memory corruption (out-of-bound access, uninitialized memory use), leaks, race +conditions and others. + +Sanitizers are shipped with most modern compilers and can be used by instructing the +compiler to link the required libraries and instrument the code. + +To build ns-3 with sanitizers, enable the ``NS3_SANITIZE`` option. This can be done +directly via CMake: + +.. sourcecode:: console + + ~/ns-3-dev/cmake_cache/$ cmake -DNS3_SANITIZE=ON .. + +Or via the ``ns3`` wrapper: + +.. sourcecode:: console + + ~/ns-3-dev$ ./ns3 configure --enable-sanitizers + +.. _issue 166 : https://gitlab.com/nsnam/ns-3-dev/-/issues/166 +.. _MR703 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/703 +.. _MR704 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/704 + +The memory sanitizer can be enabled with ``NS3_SANITIZE_MEMORY``, but it is not +compatible with ``NS3_SANITIZE`` and only works with the Clang compiler. + +Sanitizers were used to find issues in multiple occasions: + +* A global buffer overflow in the LTE module + + * When the wrong index (-1) was used to access a ``int [][]`` variable, a different variable that is stored closely in memory was accessed. + * In the best case scenario, this results in reading an incorrect value that causes the program to fail + * In the worst case scenario, this value is overwritten corrupting the program memory + * The likely scenario: wrong value is read and the program continued running, potentially producing incorrect results + + .. sourcecode:: console + + ~/ns-3-dev/src/lte/model/lte-amc.cc:303:43: runtime error: index -1 out of bounds for type 'int [110][27]' + ================================================================= + ==51636==ERROR: AddressSanitizer: global-buffer-overflow on address 0x7fe78cc2dbbc at pc 0x7fe78ba65e65 bp 0x7ffde70b25c0 sp 0x7ffde70b25b0 + READ of size 4 at 0x7fe78cc2dbbc thread T0 + #0 0x7fe78ba65e64 in ns3::LteAmc::GetDlTbSizeFromMcs(int, int) ~/ns-3-dev/src/lte/model/lte-amc.cc:303 + #1 0x7fe78c538aba in ns3::TdTbfqFfMacScheduler::DoSchedDlTriggerReq(ns3::FfMacSchedSapProvider::SchedDlTriggerReqParameters const&) ~/ns-3-dev/src/lte/model/tdtbfq-ff-mac-scheduler.cc:1160 + #2 0x7fe78c564736 in ns3::MemberSchedSapProvider::SchedDlTriggerReq(ns3::FfMacSchedSapProvider::SchedDlTriggerReqParameters const&) ~/ns-3-dev/build/include/ns3/ff-mac-sched-sap.h:409 + #3 0x7fe78c215596 in ns3::LteEnbMac::DoSubframeIndication(unsigned int, unsigned int) ~/ns-3-dev/src/lte/model/lte-enb-mac.cc:588 + #4 0x7fe78c20921d in ns3::EnbMacMemberLteEnbPhySapUser::SubframeIndication(unsigned int, unsigned int) ~/ns-3-dev/src/lte/model/lte-enb-mac.cc:297 + #5 0x7fe78b924105 in ns3::LteEnbPhy::StartSubFrame() ~/ns-3-dev/src/lte/model/lte-enb-phy.cc:764 + #6 0x7fe78b949d54 in ns3::MakeEvent(void (ns3::LteEnbPhy::*)(), ns3::LteEnbPhy*)::EventMemberImpl0::Notify() (~/ns-3-dev/build/lib/libns3-dev-lte-deb.so+0x3a9cd54) + #7 0x7fe795252022 in ns3::EventImpl::Invoke() ~/ns-3-dev/src/core/model/event-impl.cc:51 + #8 0x7fe795260de2 in ns3::DefaultSimulatorImpl::ProcessOneEvent() ~/ns-3-dev/src/core/model/default-simulator-impl.cc:151 + #9 0x7fe795262dbd in ns3::DefaultSimulatorImpl::Run() ~/ns-3-dev/src/core/model/default-simulator-impl.cc:204 + #10 0x7fe79525436f in ns3::Simulator::Run() ~/ns-3-dev/src/core/model/simulator.cc:176 + #11 0x7fe7b0f77ee2 in LteDistributedFfrAreaTestCase::DoRun() ~/ns-3-dev/src/lte/test/lte-test-frequency-reuse.cc:1777 + #12 0x7fe7952d125a in ns3::TestCase::Run(ns3::TestRunnerImpl*) ~/ns-3-dev/src/core/model/test.cc:363 + #13 0x7fe7952d0f4d in ns3::TestCase::Run(ns3::TestRunnerImpl*) ~/ns-3-dev/src/core/model/test.cc:357 + #14 0x7fe7952e39c0 in ns3::TestRunnerImpl::Run(int, char**) ~/ns-3-dev/src/core/model/test.cc:1094 + #15 0x7fe7952e427e in ns3::TestRunner::Run(int, char**) ~/ns-3-dev/src/core/model/test.cc:1118 + #16 0x564a13d67c9c in main ~/ns-3-dev/utils/test-runner.cc:23 + #17 0x7fe793cde0b2 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x270b2) + #18 0x564a13d67bbd in _start (~/ns-3-dev/build/utils/test-runner+0xae0bbd) + 0x7fe78cc2dbbc is located 40 bytes to the right of global variable 'McsToItbsUl' defined in '~/ns-3-dev/src/lte/model/lte-amc.cc:105:18' (0x7fe78cc2db20) of size 116 + 0x7fe78cc2dbbc is located 4 bytes to the left of global variable 'TransportBlockSizeTable' defined in '~/ns-3-dev/src/lte/model/lte-amc.cc:118:18' (0x7fe78cc2dbc0) of size 11880 + SUMMARY: AddressSanitizer: global-buffer-overflow ~/ns-3-dev/src/lte/model/lte-amc.cc:303 in ns3::LteAmc::GetDlTbSizeFromMcs(int, int) + Shadow bytes around the buggy address: + 0x0ffd7197db50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 04 f9 + 0x0ffd7197db60: f9 f9 f9 f9 00 00 00 00 00 00 00 00 00 00 00 00 + =>0x0ffd7197db70: 00 00 04 f9 f9 f9 f9[f9]00 00 00 00 00 00 00 00 + Shadow byte legend (one shadow byte represents 8 application bytes): + Addressable: 00 + Partially addressable: 01 02 03 04 05 06 07 + Global redzone: f9 + ==51636==ABORTING + + * The output above shows the type of error (``global-buffer-overflow``), + the stack-trace of where the bug happened (``LteAmc::GetDlTbSizeFromMcs``), + affected variables (``McsToItbsUl`` and ``TransportBlockSizeTable``), + and a shadow bytes map, showing the wrong access between square brackets. + * The the global redzone (f9) shadow bytes are empty memory allocated between global variables (00s and 04s), + which are left there to be corrupted by the bugged program. + Any eventual corruption is then traced back to the source, without affecting the program execution. + * The adopted solution in merge request `MR703`_ was to fix one of the schedulers that could produce the index value of -1, + and updating the asserts to catch the illegal index value. + +* A wrong downcast in the Wimax module: + + * The pointer was casted incorrectly to U16TlvValue instead of U8TvlValue, which could have different sizes in memory + leading to the program reading the wrong memory address. + Reading the wrong memory address can result in unexpected or invalid values being read, which could change the + program flow and corrupt memory, producing wrong simulation results or crashing the program. + + .. sourcecode:: console + + ~/ns-3-dev/src/wimax/model/service-flow.cc:159:86: runtime error: downcast of address 0x6020000148b0 which does not point to an object of type 'U16TlvValue' + 0x6020000148b0: note: object is of type 'ns3::U8TlvValue' + 48 00 00 36 c8 09 02 62 5c 7f 00 00 00 be be be be be be be 03 00 00 00 00 00 00 04 10 00 00 00 + ^~~~~~~~~~~~~~~~~~~~~~~ + vptr for 'ns3::U8TlvValue' + ~/ns-3-dev/src/wimax/model/service-flow.cc:159:99: runtime error: member call on address 0x6020000148b0 which does not point to an object of type 'U16TlvValue' + 0x6020000148b0: note: object is of type 'ns3::U8TlvValue' + 48 00 00 36 c8 09 02 62 5c 7f 00 00 00 be be be be be be be 03 00 00 00 00 00 00 04 10 00 00 00 + ^~~~~~~~~~~~~~~~~~~~~~~ + vptr for 'ns3::U8TlvValue' + ~/ns-3-dev/src/wimax/model/wimax-tlv.cc:589:10: runtime error: member access within address 0x6020000148b0 which does not point to an object of type 'U16TlvValue' + 0x6020000148b0: note: object is of type 'ns3::U8TlvValue' + 48 00 00 36 c8 09 02 62 5c 7f 00 00 00 be be be be be be be 03 00 00 00 00 00 00 04 10 00 00 00 + ^~~~~~~~~~~~~~~~~~~~~~~ + vptr for 'ns3::U8TlvValue' + + * The bug was fixed with the correct cast in merge request `MR704`_. + + +Heaptrack ++++++++++ + +.. _KDE : https://kde.org + +`Heaptrack`_ is an utility made by `KDE`_ to trace memory allocations +along with stack traces, allowing developers to identify code responsible +for possible memory leaks and unnecessary allocations. + +For the examples below we used the default configuration of ns-3, +with the output going to the ``build`` directory. The actual executable +for the ``wifi-he-network`` example is ``./build/examples/wireless/ns3-dev-wifi-he-network``, which is what is +executed by ``./ns3 run wifi-he-network``. + +To collect information of a program (in this case the ``wifi-he-network`` +example), run: + +.. sourcecode:: console + + ~ns-3-dev/$ heaptrack ./build/examples/wireless/ns3-dev-wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745 + +If you prefer to use the ``ns3`` wrapper, try: + +.. sourcecode:: console + + ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "heaptrack %s" --no-build + +In both cases, heaptrack will print to the terminal the output file: + +.. sourcecode:: console + + ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "heaptrack %s" --no-build + heaptrack output will be written to "~ns-3-dev/heaptrack.ns3-dev-wifi-he-network.210305.zst" + starting application, this might take some time... + MCS value Channel width GI Throughput + 0 20 MHz 3200 ns 5.91733 Mbit/s + 0 20 MHz 1600 ns 5.91733 Mbit/s + ... + 11 160 MHz 1600 ns 479.061 Mbit/s + 11 160 MHz 800 ns 524.459 Mbit/s + heaptrack stats: + allocations: 149185947 + leaked allocations: 10467 + temporary allocations: 21145932 + Heaptrack finished! Now run the following to investigate the data: + + heaptrack --analyze "~/ns-3-dev/heaptrack.ns3-dev-wifi-he-network.210305.zst" + +The output above shows a summary of the stats collected: ~149 million allocations, +~21 million temporary allocations and ~10 thousand possible leaked allocations. + +If ``heaptrack-gui`` is installed, running ``heaptrack`` will launch it. If it is not installed, +the command line interface will be used. + +.. sourcecode:: console + + ~/ns-3-dev$ heaptrack --analyze "~/ns-3-dev/heaptrack.ns3-dev-wifi-he-network.210305.zst" + reading file "~/ns-3-dev/heaptrack.ns3-dev-wifi-he-network.210305.zst" - please wait, this might take some time... + Debuggee command was: ~/ns-3-dev/build/examples/wireless/ns3-dev-wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745 + finished reading file, now analyzing data: + + MOST CALLS TO ALLOCATION FUNCTIONS + 23447502 calls to allocation functions with 1.12MB peak consumption from + ns3::Packet::Copy() const + in ~/ns-3-dev/build/lib/libns3-dev-network.so + 4320000 calls with 0B peak consumption from: + ns3::UdpSocketImpl::DoSendTo(ns3::Ptr<>, ns3::Ipv4Address, unsigned short, unsigned char) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::UdpSocketImpl::DoSend(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::UdpSocketImpl::Send(ns3::Ptr<>, unsigned int) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::Socket::Send(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-network.so + ns3::UdpClient::Send() + in ~/ns-3-dev/build/lib/libns3-dev-applications.so + ns3::DefaultSimulatorImpl::ProcessOneEvent() + in ~/ns-3-dev/build/lib/libns3-dev-core.so + ns3::DefaultSimulatorImpl::Run() + in ~/ns-3-dev/build/lib/libns3-dev-core.so + main + in ~/ns-3-dev/build/examples/wireless/ns3-dev-wifi-he-network + + ... + + MOST TEMPORARY ALLOCATIONS + 6182320 temporary allocations of 6182701 allocations in total (99.99%) from + ns3::QueueDisc::DropBeforeEnqueue(ns3::Ptr<>, char const*) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + 1545580 temporary allocations of 1545580 allocations in total (100.00%) from: + std::_Function_handler<>::_M_invoke(std::_Any_data const&, ns3::Ptr<>&&, char const*&&) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + std::function<>::operator()(ns3::Ptr<>, char const*) const + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::MemPtrCallbackImpl<>::operator()(ns3::Ptr<>, char const*) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::TracedCallback<>::operator()(ns3::Ptr<>, char const*) const + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::QueueDisc::DropBeforeEnqueue(ns3::Ptr<>, char const*) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::CoDelQueueDisc::DoEnqueue(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::QueueDisc::Enqueue(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::FqCoDelQueueDisc::DoEnqueue(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::QueueDisc::Enqueue(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::TrafficControlLayer::Send(ns3::Ptr<>, ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-traffic-control.so + ns3::Ipv4Interface::Send(ns3::Ptr<>, ns3::Ipv4Header const&, ns3::Ipv4Address) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::Ipv4L3Protocol::SendRealOut(ns3::Ptr<>, ns3::Ptr<>, ns3::Ipv4Header const&) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::Ipv4L3Protocol::Send(ns3::Ptr<>, ns3::Ipv4Address, ns3::Ipv4Address, unsigned char, ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::UdpL4Protocol::Send(ns3::Ptr<>, ns3::Ipv4Address, ns3::Ipv4Address, unsigned short, unsigned short, ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::UdpSocketImpl::DoSendTo(ns3::Ptr<>, ns3::Ipv4Address, unsigned short, unsigned char) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::UdpSocketImpl::DoSend(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::UdpSocketImpl::Send(ns3::Ptr<>, unsigned int) + in ~/ns-3-dev/build/lib/libns3-dev-internet.so + ns3::Socket::Send(ns3::Ptr<>) + in ~/ns-3-dev/build/lib/libns3-dev-network.so + ns3::UdpClient::Send() + in ~/ns-3-dev/build/lib/libns3-dev-applications.so + ns3::DefaultSimulatorImpl::ProcessOneEvent() + in ~/ns-3-dev/build/lib/libns3-dev-core.so + ns3::DefaultSimulatorImpl::Run() + in ~/ns-3-dev/build/lib/libns3-dev-core.so + main + in ~/ns-3-dev/build/examples/wireless/ns3-dev-wifi-he-network + + ... + + total runtime: 156.30s. + calls to allocation functions: 149185947 (954466/s) + temporary memory allocations: 21757614 (139201/s) + peak heap memory consumption: 4.87MB + peak RSS (including heaptrack overhead): 42.02MB + total memory leaked: 895.45KB + + +The terminal output above lists the most frequently called functions that allocated memory. + +Here is a short description of what each line of the last block of the output means: + +* Allocation functions are all functions that allocated memory, either explicitly + via C-style ``malloc`` and C++ ``new``, or implicitly via RAII and automatic conversions. + +* Temporary memory allocations are allocations that are followed by the + deallocation without modifying the data. + +* Peak heap memory is the maximum memory allocated by the program throughout its execution. + The memory allocator may reuse memory freed by previous destructors, ``del`` and ``free`` calls, + reducing the number of system calls and maximum memory allocated. + +* RSS is the Resident Set Size, which is the amount of physical memory occupied by the process. + +* Total memory leak refers to memory allocated but never freed. This includes static initialization, + so it is not uncommon to be different than 0KB. However this does not mean the program does not + have memory leaks. Other memory profilers such as Valgrind and memory sanitizers are better + suited to track down memory leaks. + +Based on the stack trace, it is fairly easy to locate the corresponding code and act on it to +reduce the number of allocations. + +.. _MR830 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/830 + +In the case of ``ns3::QueueDisc::DropBeforeEnqueue`` shown above, the +allocations were caused by the transformation of C strings (``char*``) into C++ strings +(std::string) before performing the search in ns3::QueueDisc::Stats maps. +These unnecessary allocations were prevented by making use of the transparent +comparator ``std::less<>``, part of merge request `MR830`_. + +Heaptrack also has a GUI that provides the same information printed by the command line interface, +but in a more interactive way. + +.. image:: figures/heaptrack.png + +Heaptrack was used in merge request `MR830`_ to track and reduce the number of allocations +in the ``wifi-he-network`` example mentioned above. About 29 million unnecessary allocations +were removed, which translates to a 20% reduction. This resulted in a 1.07x speedup of the +test suite with Valgrind (``./test.py -d -g``) and 1.02x speedup without it. + + +Performance Profilers +********************* + +.. _Perf : https://perf.wiki.kernel.org/index.php/Tutorial +.. _Hotspot : https://github.com/KDAB/hotspot +.. _AMD uProf : https://developer.amd.com/amd-uprof/ +.. _Intel VTune : https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-vtune/top.html +.. _Windows Performance Toolkit : https://docs.microsoft.com/en-us/windows-hardware/test/wpt/ +.. _Sysprof : https://wiki.gnome.org/Apps/Sysprof +.. _Oprofile : https://oprofile.sourceforge.io/faq/ +.. _Gprof : https://sourceware.org/binutils/docs/gprof/ + +Performance profilers are programs that collect runtime information and help to +identify performance bottlenecks. In some cases, they can point out hotspots +and suggest solutions. + +There are many tools to profile your program, including: + +* profilers from CPU manufacturers, such as `AMD uProf`_ and `Intel VTune`_ +* profilers from the operating systems, such as Linux's `Perf`_ and `Windows Performance Toolkit`_ + + * `Perf`_ also has a few graphical user interfaces available, being `Hotspot`_ one of them +* instrumented compilation and auxiliary tools provided by compilers, such as `Gprof`_ +* third-party tools, such as `Sysprof`_ and `Oprofile`_ + +An overview on how to use `Perf`_ with `Hotspot`_, `AMD uProf`_ and +`Intel VTune`_ is provided in the following sections. + +.. _Linux Perf and Hotspot GUI : + +Linux Perf and Hotspot GUI +++++++++++++++++++++++++++ + +`Perf`_ is the kernel tool to measure performance of the Linux kernel, +drivers and user-space applications. + +Perf tracks some performance events, being some of the most important for performance: + +* cycles + + * Clocks (time) spent running. +* cache-misses + + * When either data or instructions were not in the L1/L2 caches, requiring + a L3 or memory access. +* branch-misses + + * How many branch instructions were mispredicted. + Mispredictions causes the CPU to stall and clean the pipeline, + slowing down the program. +* stalled-cycles-frontend + + * Cycles wasted by the processor waiting for the next instruction, + usually due to instruction cache miss or mispredictions. + Starves the CPU pipeline of instructions and slows down the program. +* stalled-cycles-backend + + * Cycles wasted waiting for pipeline resources to finish their work. + Usually waiting for memory read/write, or executing long-latency instructions. + + +Just like with ``heaptrack``, perf can be executed using the ``ns3`` wrapper +command template. In the following command we output perf data from ``wifi-he-network`` +to the ``perf.data`` output file. + +.. sourcecode:: console + + + ~/ns-3-dev$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "perf record -o ./perf.data --call-graph dwarf --event cycles,cache-misses,branch-misses --sample-cpu %s" --no-build + + +`Hotspot`_ is a GUI for Perf, that makes performance profiling more +enjoyable and productive. It can parse the ``perf.data`` and show in +a more friendly way. + +To record the same perf.data from Hotspot directly, fill the fields +for working directory, path to the executable, arguments, perf +events to track and output directory for the ``perf.data``. +Then run to start recording. + +.. image:: figures/hotspot-setup.png + +The cycles per function for this program is shown in the following +image. + +.. image:: figures/hotspot-cycles.png + +The data is also presented in a tabular format in the bottom-up, +top-down and caller/callee tabs (top left of the screen). + +.. image:: figures/hotspot-top-down.png + +.. _issue 280 : https://gitlab.com/nsnam/ns-3-dev/-/issues/280 +.. _issue 426 : https://gitlab.com/nsnam/ns-3-dev/-/issues/426 +.. _MR681 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/681 +.. _MR685 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/685 + +Hotspot was used to identify performance bottlenecks in multiple occasions: + +#. ``wifi-primary-channels`` test suite was extremely slow due to unnecessary RF processing. + The adopted solution was to replace the filtering step of the entire channel to just the desired + sub-band, and assuming sub-bands are uniformly sized, saving multiplications in the integral + used to compute the power of each sub-band. This resulted in a 6x speedup with + ``./ns3 run "test-runner --fullness=TAKES_FOREVER --test-name=wifi-primary-channels"``. + Hotspot was used along with AMD uProf to track this and other bottlenecks in `issue 426`_. + +#. ``WifiMacQueue::TtlExceeded`` dereferenced data out of cache when calling Simulator::Now(). + The adopted solution was to move Simulator::Now() out of TtlExceeded and reuse the value + and inlining TtlExceeded. This resulted in a ~1.20x speedup with the test suite (``./test.py -d``). + Hotspot was used along with AMD uProf to track this and other bottlenecks in `issue 280`_ + and merge request `MR681`_. + +#. MpduAggregator and MsduAggregator required an expensive attribute lookup to get the maximum sizes + from the RegularWifiMac. Bypassing the attribute lookup reduced cache misses and unnecessary branches. + The adopted solution was to move Simulator::Now() out of TtlExceeded and reuse the value + and inlining TtlExceeded. This resulted in a ~1.02x speedup with the test suite (``./test.py -d``). + More details on: merge requests `MR681`_ and `MR685`_. + + +AMD uProf ++++++++++ + +`AMD uProf`_ works much like `Linux Perf and Hotspot GUI`_, but +is available in more platforms (Linux, Windows and BSD) using AMD +processors. Differently from Perf, it provides more performance +trackers for finer analysis. + +To use it, open uProf then click to profile an application. If you +have already profile an application, you can reuse those settings for +another application by clicking in one of the items in the ``Recently Used +Configurations`` section. + +.. image:: figures/uprof-start.png + +Fill the fields with the application path, the arguments and +the working directory. + +You may need to add the LD_LIBRARY_PATH environment variable +(or PATH on Windows), pointing it to the library output +directory (e.g. ``ns-3-dev/build/lib``). + +Then click next: + +.. image:: figures/uprof-profile-application.png + +Now select custom events and pick the events you want. + +The recommended ones for performance profiling are: + * CYCLES_NOT_IN_HALT + + * Clocks (time) spent running. + * RETIRED_INST + + * How many instructions were completed. + * These do not count mispredictions, stalls, etc. + * Instructions per clock (IPC) = RETIRED_INST / CYCLES_NOT_IN_HALT + * RETIRED_BR_INST_MISP + + * How many branch instructions were mispredicted. + * Mispredictions causes the CPU to stall and clean the pipeline, + slowing down the program. + * L2_CACHE_MISS.FROM_L1_IC_MISS + + * L2 cache misses caused by instruction L1 cache misses. + * Results in L3/memory accesses due to missing instructions in L1/L2. + * L2_CACHE_MISS.FROM_L1_DC_MISS + + * L2 cache misses caused by data L1 cache misses. + * Results in L3/memory accesses due to missing instructions in L1/L2 + * MISALIGNED_LOADS + + * Loads not aligned with processor words. + * Might result in additional cache and memory accesses. + +.. image:: figures/uprof-select-events.png + +Now click in advanced options to enable collection of the call stack. + +.. image:: figures/uprof-collect-callstack.png + +Then click ``Start Profile`` and wait for the program to end. +After it finishes you will be greeted with a hotspot summary screen, +but the ``Analyze`` tab (top of the screen) has sub-tabs with more +relevant information. + +In the following image the metrics are shown per module, including the +C library (libc.so.6) which provides the ``malloc`` and ``free`` functions. +Values can be shown in terms of samples or percentages for easier reading +and to decide where to optimize. + +.. image:: figures/uprof-stats.png + + +.. _MR677 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/677 +.. _MR680 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/680 +.. _MR681 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/681 +.. _MR777 : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/777 +.. _semantic interposition: https://maskray.me/blog/2021-05-09-fno-semantic-interposition + +Here are a few cases where AMD uProf was used to identify performance bottlenecks: + +#. ``WifiMacQueue::TtlExceeded`` dereferenced data out of cache when calling Simulator::Now(). + The adopted solution was to move Simulator::Now() out of TtlExceeded and reuse the value + and inlining TtlExceeded. This resulted in a ~1.20x speedup with the test suite (``./test.py -d``). + More details on: `issue 280`_ and merge request `MR681`_. + +#. ``wifi-primary-channels`` test suite was extremely slow due to unnecessary RF processing. + The adopted solution was to replace the filtering step of the entire channel to just the desired + sub-band, and assuming sub-bands are uniformly sized, saving multiplications in the integral + used to compute the power of each sub-band. This resulted in a 6x speedup with + ``./ns3 run "test-runner --fullness=TAKES_FOREVER --test-name=wifi-primary-channels"``. + More details on: `issue 426`_ and merge request `MR677`_. + +#. Continuing the work on ``wifi-primary-channels`` test suite, profiling showed an excessive + number of cache misses in ``InterferenceHelper::GetNextPosition``. + This function searches for an iterator on a map, which is very fast + if the map is small and fits in the cache, which was not the case. After reviewing the code, + it was noticed in most cases this call was unnecessary as the iterator was already known. + The adopted solution was to reuse the iterator whenever possible. + This resulted in a 1.78x speedup on top of the previous 6x with + ``./ns3 run "test-runner --fullness=TAKES_FOREVER --test-name=wifi-primary-channels"``. + More details on: `issue 426`_ and merge requests `MR677`_ and `MR680`_. + +#. Position-Independent Code libraries (``-fPIC``) have an additional layer of indirection that increases + instruction cache misses. The adopted solution was to disable `semantic interposition`_ with flag + ``-fno-semantic-interposition`` on GCC. This is the default setting on Clang. This results in + approximately 1.14x speedup with ``./test.py -d``. More details on: `MR777`_. + +Note: all speedups above were measured on the same machine. Results may differ based on clock speeds, +cache sizes, number of cores, memory bandwidth and latency, storage throughput and latency. + +Intel VTune ++++++++++++ + +`Intel VTune`_ works much like `Linux Perf and Hotspot GUI`_, but +is available in more platforms (Linux, Windows and Mac) using Intel +processors. Differently from Perf, it provides more performance +trackers for finer analysis. + +When you open the program, you will be greeted by the landing page +shown in the following image. To start a new profiling project, click +in the ``Configure Analysis`` button. If you already have a project, +right-click the entry and click to configure analysis to reuse the +settings. + +.. image:: figures/vtune-landing.png + +A configuration page will open, where you can fill the fields with +the path to the program, arguments, and set working directory and +environment variables. + +Note: in this example on Windows using MinGW, +we need to define the ``PATH`` environment variable with the paths +to both ``~/ns-3-dev/build/lib`` and the MinGW binaries folder +(``~/msys64/mingw64/bin``), which contains essential libraries. +On Linux-like systems you will need to define the +``LD_LIBRARY_PATH`` environment variable instead of ``PATH``. + +Clicking on the ``Performance Snapshot`` shows the different profiling +options. + +.. image:: figures/vtune-configure.png + +If executed as is, a quicker profiling will be executed to +determine what areas should be profiled with more details. +For the specific example, it is indicated that there are +microarchitectural bottlenecks and low parallelism +(not a surprise since ns-3 is single-threaded). + +.. image:: figures/vtune-perf-snapshot.png + +If the ``microarchitecture exploration`` option is selected, cycles, +branch mispredictions, cache misses and other metrics will be collected. + +.. image:: figures/vtune-select-uarch-profiling.png + +After executing the ``microarchitecture exploration``, a summary will +be shown. Hovering the mouse over the red flags will explain what +each sentence means and how it impacts performance. + +.. image:: figures/vtune-uarch-profiling-summary.png + +Clicking in the ``Bottom-up`` tab shows all the information per module. +A plethora of stats such as CPU time, instructions retired, +retiring percentage (how many of the dispatched instructions +were executed until the end, usually lower than 100% because of branch +mispredictions), bad speculation, cache misses, unused load ports, +and more. + +The stats for the wifi module are shown below. The retiring +metric indicates about 40% of dispatched instructions are +executed. The diagram on the right shows the bottleneck is +in the front-end of the pipeline (red), due to high +instruction cache misses, translation lookaside buffer (TLB) +overhead and unknown branches (most likely callbacks). + +.. image:: figures/vtune-uarch-wifi-stats.png + +The stats for the core module are shown below. +More specifically for the ns3::Object::DoGetObject function. +Metrics indicates about 63% of bad speculations. +The diagram on the right shows that there are bottlenecks +both in the front-end and due to bad speculation (red). + +.. image:: figures/vtune-uarch-core-stats.png + + +System calls profilers +********************** + +.. _dtrace : https://github.com/opendtrace/documentation +.. _strace : https://strace.io/ +.. _procmon : https://docs.microsoft.com/en-us/sysinternals/downloads/procmon + +System call profilers collect information on which system +calls were made by a program, how long they took to be +fulfilled and how many of them resulted in errors. + +There are many system call profilers, including `dtrace`_, `strace`_ and `procmon`_. + +An overview on how to use `strace`_ is provided in the following section. + +Strace +++++++ + + +The `strace`_ is a system calls (syscalls) profiler for Linux. It can filter +specific syscalls, or gather stats during the execution. + +To collect statistics, use ``strace -c``: + +.. sourcecode:: console + + ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "strace -c %s" --no-build + MCS value Channel width GI Throughput + 0 20 MHz 3200 ns 5.91733 Mbit/s + ... + 11 160 MHz 800 ns 524.459 Mbit/s + % time seconds usecs/call calls errors syscall + ------ ----------- ----------- --------- --------- ---------------- + 37.62 0.004332 13 326 233 openat + 35.46 0.004083 9 415 mmap + ... + ------ ----------- ----------- --------- --------- ---------------- + 100.00 0.011515 8 1378 251 total + +In the example above, the syscalls are listed in the right, after +the time spent on each syscall, number of calls and errors. + +The errors can be caused due to multiple reasons and may not +be a problem. To check if they were problems, strace can log the +syscalls with ``strace -o calls.log``: + +.. sourcecode:: console + + ~ns-3-dev/$ ./ns3 run "wifi-he-network --simulationTime=0.3 --frequency=5 --useRts=1 --minExpectedThroughput=6 --maxExpectedThroughput=745" --command-template "strace -o calls.log %s" --no-build + MCS value Channel width GI Throughput + 0 20 MHz 3200 ns 5.91733 Mbit/s + ... + 11 160 MHz 800 ns 524.459 Mbit/s + + +Looking at the ``calls.log`` file, we can see different sections. In the +following section, the example is executed (``execve``), architecture is checked (``arch_prctl``), +memory is mapped for execution (``mmap``) and LD_PRELOAD use is checked. + +.. sourcecode:: console + + execve("~/ns-3-dev/build/examples/wireless/ns3-dev-wifi-he-network", ["~/ns-3-dev/b"..., "--simulationTime=0.3", "--frequency=5", "--useRts=1", "--minExpectedThroughput=6", "--maxExpectedThroughput=745"], 0x7fffb0f91ad8 /* 3 vars */) = 0 + brk(NULL) = 0x563141b37000 + arch_prctl(0x3001 /* ARCH_??? */, 0x7ffff8d63a50) = -1 EINVAL (Invalid argument) + mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f103c2e9000 + access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) + +Then the program searches for the wifi module library and fails multiple times (the errors seen in the table above). + +.. sourcecode:: console + + openat(AT_FDCWD, "~/ns-3-dev/build/lib/glibc-hwcaps/x86-64-v3/libns3-dev-wifi.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) + newfstatat(AT_FDCWD, "~/ns-3-dev/build/lib/glibc-hwcaps/x86-64-v3", 0x7ffff8d62c80, 0) = -1 ENOENT (No such file or directory) + ... + openat(AT_FDCWD, "~/ns-3-dev/build/lib/x86_64/libns3-dev-wifi.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) + newfstatat(AT_FDCWD, "~/ns-3-dev/build/lib/x86_64", 0x7ffff8d62c80, 0) = -1 ENOENT (No such file or directory) + +The library is finally found and its header is read: + +.. sourcecode:: console + + openat(AT_FDCWD, "~/ns-3-dev/build/lib/libns3-dev-wifi.so", O_RDONLY|O_CLOEXEC) = 3 + read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0py\30\0\0\0\0\0"..., 832) = 832 + +Then other modules that wifi depends on are loaded, then execution of the program continues to the main +function of the simulation. + +Strace was used to track down issues found while running the ``lena-radio-link-failure`` example. +Its ``strace -c`` table was the following: + +.. sourcecode:: console + + % time seconds usecs/call calls errors syscall + ------ ----------- ----------- --------- --------- ---------------- + 31,51 0,246243 2 103480 942 openat + 30,23 0,236284 2 102360 write + 19,90 0,155493 1 102538 close + 16,65 0,130132 1 102426 lseek + 1,05 0,008186 18 437 mmap + 0,21 0,001671 16 99 newfstatat + 0,20 0,001595 11 134 mprotect + 0,18 0,001391 14 98 read + ... + ------ ----------- ----------- --------- --------- ---------------- + 100,00 0,781554 1 411681 951 total + +Notice the number of ``openat``, ``write``, ``close`` and ``lseek`` calls +are much more frequent than the other calls. These mean +``lena-radio-link-failure`` is opening, then seeking, then writing, +then closing at least one file handler. + +Using ``strace``, we can easily find the most frequently used file handlers. + +.. sourcecode:: console + + ~ns-3-dev/$./ns3 run "lena-radio-link-failure --numberOfEnbs=2 --useIdealRrc=0 --interSiteDistance=700 --simTime=17" --command-template="strace %s" + ... + openat(AT_FDCWD, "DlTxPhyStats.txt", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3 + lseek(3, 0, SEEK_END) = 9252 + write(3, "635\t1\t1\t1\t0\t20\t1191\t0\t1\t0\n", 26) = 26 + close(3) = 0 + openat(AT_FDCWD, "DlMacStats.txt", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3 + lseek(3, 0, SEEK_END) = 11100 + write(3, "0.635\t1\t1\t64\t6\t1\t20\t1191\t0\t0\t0\n", 31) = 31 + close(3) = 0 + openat(AT_FDCWD, "UlMacStats.txt", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3 + lseek(3, 0, SEEK_END) = 8375 + write(3, "0.635\t1\t1\t64\t6\t1\t0\t85\t0\n", 24) = 24 + close(3) = 0 + openat(AT_FDCWD, "DlRsrpSinrStats.txt", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3 + lseek(3, 0, SEEK_END) = 16058 + write(3, "0.635214\t1\t1\t1\t6.88272e-15\t22.99"..., 37) = 37 + close(3) = 0 + openat(AT_FDCWD, "UlTxPhyStats.txt", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3 + ... + +.. _MR814: https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/814 + +With the name of the files, we can look at the code that manipulates them. + +The issue above was found in `MR777`_, were performance for some LTE examples +regressed for no apparent reason. The flame graph below, produced by `AMD uProf`_, +contains four large columns/"flames" in red, which +correspond to the ``write``, ``openat``, ``close`` and ``lseek`` syscalls. + +.. image:: figures/uprof-strace-lte.png + +Upon closer inspection, these syscalls take a long time to complete due to +the underlying filesystem of the machine running the example (NTFS mount +using the ntfs-3g FUSE filesystem). In other words, the bottleneck only +exists when running the example in slow file systems +(e.g. FUSE and network file systems). + +The merge request `MR814`_ addressed the issue by keeping the files open +throughout the simulation. That alone resulted in a 1.75x speedup. + + +Compilation Profilers +********************* + +Compilation profilers can help identifying which steps of the compilation +are slowing it down. These profilers are built into the compilers themselves, +only requiring third-party tools to consolidate the results. + +The GCC feature is mentioned and exemplified, but is not the recommended +compilation profiling method. For that, Clang is recommended. + +GCC ++++ + +.. _not a priority : https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92396 + +GCC has a special flag ``-ftime-report``, which makes it print a table +with time spent per compilation phase for each compiled file. The printed +output for a file is shown below. The line of ``---`` was inserted for clarity. + +.. sourcecode:: console + + Time variable usr sys wall GGC + phase setup : 0.00 ( 0%) 0.00 ( 0%) 0.01 ( 1%) 1478 kB ( 2%) + phase parsing : 0.31 ( 46%) 0.17 ( 85%) 0.48 ( 55%) 55432 kB ( 71%) + phase lang. deferred : 0.03 ( 4%) 0.00 ( 0%) 0.03 ( 3%) 4287 kB ( 5%) + phase opt and generate : 0.32 ( 48%) 0.03 ( 15%) 0.35 ( 40%) 16635 kB ( 21%) + phase last asm : 0.01 ( 1%) 0.00 ( 0%) 0.01 ( 1%) 769 kB ( 1%) + ------------------------------------------------------------------------------------------------ + |name lookup : 0.05 ( 7%) 0.02 ( 10%) 0.04 ( 5%) 2468 kB ( 3%) + |overload resolution : 0.05 ( 7%) 0.00 ( 0%) 0.05 ( 6%) 4217 kB ( 5%) + dump files : 0.00 ( 0%) 0.00 ( 0%) 0.01 ( 1%) 0 kB ( 0%) + callgraph construction : 0.01 ( 1%) 0.00 ( 0%) 0.01 ( 1%) 2170 kB ( 3%) + ... + preprocessing : 0.05 ( 7%) 0.06 ( 30%) 0.10 ( 11%) 1751 kB ( 2%) + parser (global) : 0.06 ( 9%) 0.03 ( 15%) 0.07 ( 8%) 16303 kB ( 21%) + parser struct body : 0.06 ( 9%) 0.04 ( 20%) 0.08 ( 9%) 12525 kB ( 16%) + parser enumerator list : 0.01 ( 1%) 0.00 ( 0%) 0.00 ( 0%) 112 kB ( 0%) + parser function body : 0.02 ( 3%) 0.02 ( 10%) 0.02 ( 2%) 3039 kB ( 4%) + parser inl. func. body : 0.03 ( 4%) 0.00 ( 0%) 0.01 ( 1%) 2024 kB ( 3%) + parser inl. meth. body : 0.02 ( 3%) 0.01 ( 5%) 0.06 ( 7%) 5792 kB ( 7%) + template instantiation : 0.09 ( 13%) 0.01 ( 5%) 0.13 ( 15%) 12274 kB ( 16%) + ... + symout : 0.01 ( 1%) 0.00 ( 0%) 0.02 ( 2%) 8114 kB ( 10%) + ... + TOTAL : 0.67 0.20 0.88 78612 kB + +In the table above, the first few lines show the five main compilations steps: ``setup``, +``parsing``, ``lang. deferred`` (C++ specific transformations), +``opt(imize) and generate (code)``, ``last asm`` (produce binary code). + +The lines below the ``---`` line show sub-steps of the five main compilation steps. +For this specific case, parsing global definitions (21%) and structures (16%), +``template instantiation`` (16%) and generating the code in ``symout`` (10%). + +Aggregating the data into a meaningful output to help focus where to improve is not that easy +and it is `not a priority`_ for GCC developers. + +It is recommended to use the Clang alternative. + +Clang ++++++ + +.. _ClangBuildAnalyzer : https://github.com/aras-p/ClangBuildAnalyzer + +Clang can output very similar results with the ``-ftime-trace`` flag, but can also aggregate +it in a more meaningful way. With the help of the third-party tool `ClangBuildAnalyzer`_, +we can have really good insights on where to spend time trying to speed up the compilation. + +Support for building with ``-ftime-trace``, compiling `ClangBuildAnalyzer`_ and producing a +report for the project have been baked into the CMake project of ns-3, and can be enabled +with ``-DNS3_CLANG_TIMETRACE=ON``. + +.. sourcecode:: console + + ~/ns-3-dev/cmake_cache$ cmake -DNS3_CLANG_TIMETRACE=ON .. + +Or via ``ns3``: + +.. sourcecode:: console + + ~/ns-3-dev$ ./ns3 configure -- -DNS3_CLANG_TIMETRACE=ON + +The entire procedure looks like the following: + +.. sourcecode:: console + + ~/ns-3-dev$ CXX="clang++" ./ns3 configure -d release --enable-examples --enable-tests -- -DNS3_CLANG_TIMETRACE=ON + ~/ns-3-dev$ ./ns3 build timeTraceReport + ~/ns-3-dev$ cat ClangBuildAnalyzerReport.txt + Analyzing build trace from '~/ns-3-dev/cmake_cache/clangBuildAnalyzerReport.bin'... + **** Time summary: + Compilation (2993 times): + Parsing (frontend): 2476.1 s + Codegen & opts (backend): 1882.9 s + + **** Files that took longest to parse (compiler frontend): + 8966 ms: src/test/CMakeFiles/libtest.dir/traced/traced-callback-typedef-test-suite.cc.o + 6633 ms: src/wifi/examples/CMakeFiles/wifi-bianchi.dir/wifi-bianchi.cc.o + ... + + **** Files that took longest to codegen (compiler backend): + 36430 ms: src/wifi/CMakeFiles/libwifi-test.dir/test/block-ack-test-suite.cc.o + 24941 ms: src/wifi/CMakeFiles/libwifi-test.dir/test/wifi-mac-ofdma-test.cc.o + ... + + **** Templates that took longest to instantiate: + 12651 ms: std::unordered_map (615 times, avg 20 ms) + 10950 ms: std::_Hashtable, std::allocator (1172 times, avg 9 ms) + ... + + **** Template sets that took longest to instantiate: + 111660 ms: std::list<$> (27141 times, avg 4 ms) + 79892 ms: std::_List_base<$> (27140 times, avg 2 ms) + 75131 ms: std::map<$> (11752 times, avg 6 ms) + 65214 ms: std::allocator<$> (66622 times, avg 0 ms) + ... + + **** Functions that took longest to compile: + 7206 ms: OfdmaAckSequenceTest::CheckResults(ns3::Time, ns3::Time, unsigned ch... (~/ns-3-dev/src/wifi/test/wifi-mac-ofdma-test.cc) + 6146 ms: PieQueueDiscTestCase::RunPieTest(ns3::QueueSizeUnit) (~/ns-3-dev/src/traffic-control/test/pie-queue-disc-test-suite.cc) + ... + + **** Function sets that took longest to compile / optimize: + 14801 ms: std::__cxx11::basic_string<$> ns3::CallbackImplBase::GetCppTypeid<$>() (2342 times, avg 6 ms) + 12013 ms: ns3::CallbackImpl<$>::DoGetTypeid[abi:cxx11]() (1283 times, avg 9 ms) + 10034 ms: ns3::Ptr<$>::~Ptr() (5975 times, avg 1 ms) + 8932 ms: ns3::Callback<$>::DoAssign(ns3::Ptr<$>) (591 times, avg 15 ms) + 6318 ms: ns3::CallbackImpl<$>::DoGetTypeid() (431 times, avg 14 ms) + ... + + *** Expensive headers: + 293609 ms: ~/ns-3-dev/build/include/ns3/log.h (included 1404 times, avg 209 ms), included via: + cqa-ff-mac-scheduler.cc.o (758 ms) + ipv6-list-routing.cc.o (746 ms) + ... + + 239884 ms: ~/ns-3-dev/build/include/ns3/nstime.h (included 1093 times, avg 219 ms), included via: + lte-enb-rrc.cc.o lte-enb-rrc.h (891 ms) + wifi-acknowledgment.cc.o wifi-acknowledgment.h (877 ms) + ... + + 216218 ms: ~/ns-3-dev/build/include/ns3/object.h (included 1205 times, avg 179 ms), included via: + energy-source-container.cc.o energy-source-container.h energy-source.h (1192 ms) + phased-array-model.cc.o phased-array-model.h (1135 ms) + ... + + 206801 ms: ~/ns-3-dev/build/include/ns3/core-module.h (included 195 times, avg 1060 ms), included via: + sample-show-progress.cc.o (1973 ms) + length-example.cc.o (1848 ms) + ... + + 193116 ms: /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h (included 1499 times, avg 128 ms), included via: + model-typeid-creator.h attribute-default-iterator.h type-id.h attribute.h string (250 ms) + li-ion-energy-source-helper.h energy-model-helper.h attribute.h string (243 ms) + ... + + 185075 ms: /usr/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/ios_base.h (included 1495 times, avg 123 ms), included via: + iomanip (403 ms) + mpi-test-fixtures.h iomanip (364 ms) + ... + + 169464 ms: ~/ns-3-dev/build/include/ns3/ptr.h (included 1399 times, avg 121 ms), included via: + lte-test-rlc-um-e2e.cc.o config.h (568 ms) + lte-test-rlc-um-transmitter.cc.o simulator.h event-id.h (560 ms) + ... + + + done in 2.8s. + +The output printed out contain a summary of time spent on parsing and on code generation, along +with multiple lists for different tracked categories. From the summary, it is clear that parsing times +are very high when compared to the optimization time (``-O3``). Skipping the others categories and going straight +to the expensive headers section, we can better understand why parsing times are so high, with some headers +adding as much as 5 minutes of CPU time to the parsing time. + +.. _drastically speed up parsing times : https://gitlab.com/nsnam/ns-3-dev/-/merge_requests/731#note_687176503 + +Precompiled headers (``-DNS3_PRECOMPILE_HEADERS=ON``) can `drastically speed up parsing times`_, +however, they can increase ccache misses, reducing the time of the first +compilation at the cost of increasing recompilation times. + diff --git a/doc/manual/source/working-with-gitlab-ci-local.rst b/doc/manual/source/working-with-gitlab-ci-local.rst new file mode 100644 index 000000000..30617edfd --- /dev/null +++ b/doc/manual/source/working-with-gitlab-ci-local.rst @@ -0,0 +1,166 @@ +.. include:: replace.txt +.. highlight:: bash + + +Working with gitlab-ci-local +---------------------------- + +.. _continuos integration (CI) : https://docs.gitlab.com/ee/ci/ +.. _pipelines : https://docs.gitlab.com/ee/ci/introduction/index.html#continuous-integration +.. _daily and weekly pipelines : https://gitlab.com/nsnam/ns-3-dev/-/pipeline_schedules +.. _crypto miners abuse : https://about.gitlab.com/blog/2021/05/17/prevent-crypto-mining-abuse/ +.. _GitLab-CI-local : https://github.com/firecow/gitlab-ci-local +.. _GitLab CI : https://docs.gitlab.com/ee/ci/ +.. _Docker : https://docs.docker.com/desktop/ +.. _rootless mode : https://docs.docker.com/engine/security/rootless/ + +The ns-3 project repository is currently hosted in GitLab, which includes +`continuos integration (CI)`_ tools to automate build, tests, packaging and +distribution of software. The CI works based on jobs, that are defined +on YAML files. + +The ns-3 GitLab CI files are located in ``ns-3-dev/utils/tests/``. +The main GitLab CI file is ``gitlab-ci.yml``. The different jobs +are used to check if a multitude of compilers and package versions +are compatible with the current ns-3 build, which is why a build is +usually followed by a test run. Other CI jobs build and warn about +missing the documentation. + +The GitLab CI jobs are executed based on `pipelines`_ containing a +sequence of job batches. Jobs within a batch can be executed in parallel. +These `pipelines`_ can be triggered manually, or scheduled to run automatically +per commit and/or based on a time period +(ns-3 has `daily and weekly pipelines`_ scheduled). + +The GitLab CI free tier is very slow, taking a lot of time to identify +issues during active merge request development. + +Note: the free tier +now requires a credit card due to `crypto miners abuse`_. + +`GitLab-CI-local`_ is a tool that allows an user to use the `GitLab CI`_ +configuration files locally, allowing for the debugging of CI settings +and pipelines without requiring pushes to test repositories or main +repositories that fill up the CI job queues with failed jobs due to +script errors. + +GitLab-CI-local relies on `Docker`_ to setup the environment to execute +the jobs. + +Note: Docker is usually setup in root mode, requiring +frequent use of administrative permissions/sudo. However, +this is highly discouraged. You can configure Docker to run +in `rootless mode`_. From this point onwards, we assume Docker is configured +in `rootless mode`_. + +After installing both `Docker`_ in `rootless mode`_ and `GitLab-CI-local`_, +the ns-3 jobs can be listed using the following command: + +.. sourcecode:: bash + + ~/ns-3-dev$ gitlab-ci-local --file ./utils/tests/gitlab-ci.yml --list + parsing and downloads finished in 226 ms + name description stage when allow_failure needs + weekly-build-ubuntu-18.04-debug build on_success false + + ... + + weekly-build-clang-11-optimized build on_success false + pybindgen build on_success false + per-commit-compile-debug build on_success false + per-commit-compile-release build on_success false + per-commit-compile-optimized build on_success false + daily-test-debug test on_success false + daily-test-release test on_success false + daily-test-optimized test on_success false + daily-test-optimized-valgrind test on_success false + weekly-test-debug-valgrind test on_success false + weekly-test-release-valgrind test on_success false + weekly-test-optimized-valgrind test on_success false + weekly-test-takes-forever-optimized test on_success false + doxygen documentation on_success false + manual documentation on_success false + tutorial documentation on_success false + models documentation on_success false + +To execute the ``per-commit-compile-release`` job, or any of the others listed above, use +the following command. + +.. sourcecode:: console + + ~/ns-3-dev$ gitlab-ci-local --file ./utils/tests/gitlab-ci.yml per-commit-compile-release + +WARNING: if you do not specify the job name, all jobs that can be executed in parallel will be +executed at the same time. You may run out of disk, memory or both. + +Some jobs might require a previous job to complete successfully before getting started. +The doxygen job is one of these. + +.. sourcecode:: console + + ~/ns-3-dev$ gitlab-ci-local --file ./utils/tests/gitlab-ci.yml doxygen + Using fallback git user.name + Using fallback git user.email + parsing and downloads finished in 202 ms + doxygen starting archlinux:latest (documentation) + doxygen pulled archlinux:latest in 64 ms + doxygen > still running... + doxygen > still running... + doxygen copied to container in 20 s + doxygen imported cache 'ccache-' in 3.67 s + ~/ns-3-dev/.gitlab-ci-local/artifacts/pybindgen doesn't exist, did you forget --needs + +As instructed by the previous command output, you can add the ``--needs`` to build required +jobs before proceeding. However, doing so will run all jobs as the ``doxygen`` is only supposed +to run after weekly jobs are successfully executed. + +Another option is to run the specific job that produces the required artifact. In this case +the ``pybindgen`` job. + +.. sourcecode:: console + + ~/ns-3-dev$ gitlab-ci-local --file ./utils/tests/gitlab-ci.yml pybindgen + Using fallback git user.name + Using fallback git user.email + parsing and downloads finished in 202 ms + pybindgen starting archlinux:latest (build) + + ... + + pybindgen $ git diff src > pybindgen_new.patch + pybindgen exported artifacts in 911 ms + pybindgen copied artifacts to cwd in 56 ms + pybindgen finished in 5.77 min + + PASS pybindgen + +Then run the doxygen job again: + +.. sourcecode:: bash + + ~/ns-3-dev$ gitlab-ci-local --file ./utils/tests/gitlab-ci.yml doxygen + Using fallback git user.name + Using fallback git user.email + parsing and downloads finished in 170 ms + doxygen starting archlinux:latest (documentation) + + ... + + doxygen > 1 files with warnings + doxygen > Doxygen Warnings Summary + doxygen > ---------------------------------------- + doxygen > 1 directories + doxygen > 1 files + doxygen > 23 warnings + doxygen > done. + doxygen exported cache ns-3-ccache-storage/ 'ccache-' in 6.86 s + doxygen exported artifacts in 954 ms + doxygen copied artifacts to cwd in 59 ms + doxygen finished in 15 min + + PASS doxygen + +Artifacts built by the CI jobs will be stored in separate subfolders +based on the job name. + +``~/ns-3-dev/.gitlab-ci-local/artifacts/jobname``