From ed5a19f22710831a0225ca69fee59df86869db5a Mon Sep 17 00:00:00 2001 From: "Peter D. Barnes, Jr" Date: Thu, 15 Oct 2020 15:56:05 -0700 Subject: [PATCH] core: (merges !442) Add CsvReader for parsing of csv- or tab-delimited data --- src/core/helper/csv-reader.cc | 480 ++++++++++++++++++++++++++++++++++ src/core/helper/csv-reader.h | 429 ++++++++++++++++++++++++++++++ src/core/wscript | 2 + 3 files changed, 911 insertions(+) create mode 100644 src/core/helper/csv-reader.cc create mode 100644 src/core/helper/csv-reader.h diff --git a/src/core/helper/csv-reader.cc b/src/core/helper/csv-reader.cc new file mode 100644 index 000000000..845bd7d65 --- /dev/null +++ b/src/core/helper/csv-reader.cc @@ -0,0 +1,480 @@ +/* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */ +/* + * Copyright (c) 2019 Lawrence Livermore National Laboratory + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Mathew Bielejeski + */ + +#include "csv-reader.h" + +#include "ns3/log.h" + +#include +#include +#include +#include +#include +#include + +/** + * \file + * \ingroup core + * \ingroup csvreader + * + * ns3::CsvReader implementation + */ + +NS_LOG_COMPONENT_DEFINE ("CsvReader"); + +namespace { + +/** + * Convert a string into another type. + * + * Uses a stringstream to deserialize the value stored in \p input + * to a value of type T and writes the deserialized value to \p output. + * + * \tparam T Data type of output. + * \param input String containing serialized data. + * \param output Place to store deserialized value. + * + * \return \c true if deserialization was successful, \c false otherwise. + */ +template +bool GenericTransform (std::string input, T& output) +{ + NS_LOG_FUNCTION (input); + + std::istringstream stream (input); + + stream >> output; + + return static_cast (stream); +} + +} // unnamed namespace + +namespace ns3 { + +CsvReader::CsvReader (const std::string& filepath, char delimiter /* =',' */) + : m_delimiter (delimiter), + m_rowsRead (0), + m_fileStream (filepath), + m_stream (&m_fileStream) +{ + NS_LOG_FUNCTION (this << filepath); +} + +CsvReader::CsvReader (std::istream& stream, char delimiter /* =',' */) + : m_delimiter (delimiter), + m_rowsRead (0), + m_fileStream (), + m_stream (&stream) +{ + NS_LOG_FUNCTION (this); +} + +CsvReader::~CsvReader () +{} + +std::size_t +CsvReader::ColumnCount () const +{ + NS_LOG_FUNCTION (this); + + return m_columns.size (); +} + +std::size_t +CsvReader::RowNumber () const +{ + NS_LOG_FUNCTION (this); + + return m_rowsRead; +} + +char +CsvReader::Delimiter () const +{ + NS_LOG_FUNCTION (this); + + return m_delimiter; +} + +bool +CsvReader::FetchNextRow () +{ + NS_LOG_FUNCTION (this); + + std::string line; + + if ( m_stream->eof () ) + { + NS_LOG_LOGIC ("Reached end of stream"); + return false; + } + + NS_LOG_LOGIC ("Reading line " << m_rowsRead + 1); + + std::getline (*m_stream, line); + + if ( m_stream->fail () ) + { + NS_LOG_ERROR ("Reading line " << m_rowsRead + 1 << " failed"); + + return false; + } + + ++m_rowsRead; + + ParseLine (line); + + return true; +} + +bool +CsvReader::IsBlankRow () const +{ + return m_blankRow; +} + +bool +CsvReader::GetValueAs (std::string input, double& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, float& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, signed char& value) const +{ + typedef signed char byte_type; + + NS_LOG_FUNCTION (this << input); + + std::istringstream tempStream (input); + + std::int16_t tempOutput = 0; + tempStream >> tempOutput; + + if (tempOutput >= std::numeric_limits::min () + || tempOutput <= std::numeric_limits::max () ) + { + value = static_cast (tempOutput); + } + + bool success = static_cast (tempStream); + + NS_LOG_DEBUG ("Input='" << input + << "', output=" << tempOutput + << ", result=" << success); + + return success; +} + +bool +CsvReader::GetValueAs (std::string input, short& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, int& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, long& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, long long& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, std::string& value) const +{ + NS_LOG_FUNCTION (this << input); + + value = input; + + return true; +} + +bool +CsvReader::GetValueAs (std::string input, unsigned char& value) const +{ + NS_LOG_FUNCTION (this << input); + + typedef unsigned char byte_type; + + NS_LOG_FUNCTION (this << input); + + std::istringstream tempStream (input); + + std::uint16_t tempOutput = 0; + tempStream >> tempOutput; + + if (tempOutput >= std::numeric_limits::min () + || tempOutput <= std::numeric_limits::max () ) + { + value = static_cast (tempOutput); + } + + bool success = static_cast (tempStream); + + NS_LOG_DEBUG ("Input='" << input + << "', output=" << tempOutput + << ", result=" << success); + + return success; +} + +bool +CsvReader::GetValueAs (std::string input, unsigned short& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, unsigned int& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, unsigned long& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::GetValueAs (std::string input, unsigned long long& value) const +{ + NS_LOG_FUNCTION (this << input); + + return GenericTransform (std::move (input), value); +} + +bool +CsvReader::IsDelimiter (char c) const +{ + NS_LOG_FUNCTION (this << c); + + return c == m_delimiter; +} + +void +CsvReader::ParseLine (const std::string& line) +{ + NS_LOG_FUNCTION (this << line); + + std::string value; + m_columns.clear (); + + auto start_col = line.begin (); + auto end_col = line.end (); + + while ( start_col != line.end () ) + { + std::tie (value, end_col) = ParseColumn (start_col, line.end ()); + + NS_LOG_DEBUG ("ParseColumn() returned: " << value); + + m_columns.push_back (std::move (value)); + + if ( end_col != line.end () ) + { + ++end_col; + } + + start_col = end_col; + } + m_blankRow = (m_columns.size () == 1) && (m_columns[0] == ""); + NS_LOG_LOGIC ("blank row: " << m_blankRow); +} + +std::tuple +CsvReader::ParseColumn (std::string::const_iterator begin, std::string::const_iterator end) +{ + NS_LOG_FUNCTION (this << std::string (begin, end)); + + enum class State + { + BEGIN, + END_QUOTE, + FIND_DELIMITER, + QUOTED_STRING, + UNQUOTED_STRING, + END + }; + + State state = State::BEGIN; + std::string buffer; + auto iter = begin; + + while (state != State::END) + { + if (iter == end) + { + NS_LOG_DEBUG ("Found end iterator, switching to END state"); + + state = State::END; + continue; + } + + auto c = *iter; + + NS_LOG_DEBUG ("Next character: '" << c << "'"); + + //handle common cases here to avoid duplicating logic + if (state != State::QUOTED_STRING) + { + if (IsDelimiter (c)) + { + NS_LOG_DEBUG ("Found field delimiter, switching to END state"); + + if ( state == State::UNQUOTED_STRING ) + { + NS_LOG_DEBUG ("Removing trailing whitespace from unquoted field: '" << buffer << "'"); + auto len = buffer.size (); + + //remove trailing whitespace from the field + while ( !buffer.empty () + && std::isspace (static_cast (buffer.back ())) ) + { + buffer.pop_back (); + } + + auto finalLen = buffer.size (); + + NS_LOG_DEBUG ("Removed " << (len - finalLen) << " trailing whitespace characters"); + } + + state = State::END; + + continue; + } + else if (c == '#') + { + NS_LOG_DEBUG ("Found start of comment, switching to END state"); + + //comments consume the rest of the line, set iter to end + //to reflect that fact. + iter = end; + state = State::END; + + continue; + } + } + + switch (state) + { + case State::BEGIN: + { + if (c == '"') + { + NS_LOG_DEBUG ("Switching state: BEGIN -> QUOTED_STRING"); + + state = State::QUOTED_STRING; + } + else if (!std::isspace (c)) + { + NS_LOG_DEBUG ("Switching state: BEGIN -> UNQUOTED_STRING"); + + state = State::UNQUOTED_STRING; + buffer.push_back (c); + } + + } break; + case State::QUOTED_STRING: + { + if (c == '"') + { + NS_LOG_DEBUG ("Switching state: QUOTED_STRING -> END_QUOTE"); + state = State::END_QUOTE; + } + else + { + buffer.push_back (c); + } + + } break; + case State::END_QUOTE: + { + if (c == '"') + { + NS_LOG_DEBUG ("Switching state: END_QUOTE -> QUOTED_STRING" ); + + //an escape quote instead of an end quote + state = State::QUOTED_STRING; + buffer.push_back (c); + } + else + { + NS_LOG_DEBUG ("Switching state: END_QUOTE -> FIND_DELIMITER" ); + state = State::FIND_DELIMITER; + } + + } break; + case State::UNQUOTED_STRING: + { + buffer.push_back (c); + } break; + case State::FIND_DELIMITER: + break; + case State::END: + break; + } + + ++iter; + } + + NS_LOG_DEBUG ("Field value: " << buffer); + + return std::make_tuple (buffer, iter); +} + +} // namespace ns3 + diff --git a/src/core/helper/csv-reader.h b/src/core/helper/csv-reader.h new file mode 100644 index 000000000..064ca88b1 --- /dev/null +++ b/src/core/helper/csv-reader.h @@ -0,0 +1,429 @@ +/* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */ +/* + * Copyright (c) 2019 Lawrence Livermore National Laboratory + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Mathew Bielejeski + */ + +#ifndef NS3_CSV_READER_H_ +#define NS3_CSV_READER_H_ + +#include +#include +#include +#include +#include +#include + +/** + * \file + * \ingroup core + * \ingroup csvreader + * + * ns3::CsvReader declaration + * + */ +namespace ns3 { + +/** + * \ingroup core + * \defgroup csvreader + * + * A way to extract data from simple csv files. + */ + +// *NS_CHECK_STYLE_OFF* Style checker trims blank lines in code blocks + +/** + * \ingroup csvreader + * + * Provides functions for parsing and extracting data from + * Comma Separated Value (CSV) formatted text files. + * This parser is somewhat more relaxed than \RFC{4180}; + * see below for a list of the differences. + * In particular it is possible to set the delimiting character at construction, + * enabling parsing of tab-delimited streams or other formats with delimiters. + * + * \note Excel may generate "CSV" files with either ',' or ';' delimiter + * depending on the locale: if ',' is the decimal mark then ';' is the list + * separator and used to read/write "CSV" files. + * + * To use this facility, construct a CsvReader from either a file path + * or \c std::istream, then FetchNextRow(), and finally GetValue() + * to extract specific values from the row. + * + * For example: + * \code + * CsvReader csv (filePath); + * while (csv.FetchNextRow ()) + * { + * // Ignore blank lines + * if (csv.IsBlankLine ()) + * { + * continue; + * } + * + * // Expecting three values + * double x, y, z; + * bool ok = csv.GetValue (0, x); + * ok |= csv.GetValue (1, y); + * ok |= csv.GetValue (2, z); + * if (!ok) + * { + * // Handle error, then + * continue; + * } + * + * // Do something with values + * + * } // while FetchNextRow + * \endcode + * + * As another example, supposing we need a vector from each row, + * the middle of the previous example would become: + * \code + * std::vector v (n); + * bool ok = true; + * for (std::size_t i = 0; i < v.size (); ++i) + * { + * ok |= csv.GetValue (i, v[i]); + * } + * if (!ok) ... + * \endcode + * + * + * File Format + * =========== + * + * This parser implements \RFC{4180}, but with several restrictions removed; + * see below for differences. All the formatting features described next + * are illustrated in the examples which which follow. + * + * Comments + * -------- + * + * The hash character (#) is used to indicate the start of a comment. Comments + * are not parsed by the reader. Comments are treated as either an empty column + * or part of an existing column depending on where the comment is located. + * Comments that are found at the end of a line containing data are ignored. + * + * 1,2 # This comment ignored, leaving two data columns + * + * Lines that contain a comment and no data are treated as rows with a single + * empty column, meaning that ColumnCount will return 1 and + * GetValue() will return an empty string. + * + * # This row treated as a single empty column, returning an empty string. + * "" # So is this + * + * IsBlankLine() will return \c true in either of these cases. + * + * Quoted Columns + * -------------- + * + * Columns with string data which contain the delimiter character or + * the hash character can be wrapped in double quotes to prevent CsvReader + * from treating them as special characters. + * + * 3,string without delimiter,"String with comma ',' delimiter" + * + * Double quotes can be escaped + * by doubling up the quotes inside a quoted field. See example 6 below for + * a demonstration. + * + * Whitespace + * ---------- + * + * Leading and trailing whitespace are ignored by the reader and are not + * stored in the column data. + * + * 4,5 , 6 # Columns contain '4', '5', '6' + * + * If leading or trailing whitespace are important + * for a column, wrap the column in double quotes as discussed above. + * + * 7,"8 "," 9" # Columns contain '7', '8 ', ' 9' + * + * Trailing Delimiter + * ------------------ + * + * Trailing delimiters are ignored; they do _not_ result in an empty column. + * + * + * Differences from RFC 4180 + * ------------------------- + * Section 2.1 + * - Line break can be LF or CRLF + * + * Section 2.3 + * - Non-parsed lines are allowed anywhere, not just as a header. + * - Lines do not all have to contain the same number fields. + * + * Section 2.4 + * - Characters other than comma can be used to separate fields. + * - Lines do not all have to contain the same number fields. + * - Leading/trailing spaces are stripped from the field + * unless the whitespace is wrapped in double quotes. + * - A trailing delimiter on a line is not an error. + * + * Section 2.6 + * - Quoted fields cannot contain line breaks + * + * Examples + * -------- + * \par Example 1: Basic + * \code + * # Column 1: Product + * # Column 2: Price + * widget, 12.5 + * \endcode + * + * \par Example 2: Comment at end of line + * \code + * # Column 1: Product + * # Column 2: Price + * broken widget, 12.5 # this widget is broken + * \endcode + * + * \par Example 3: Delimiter in double quotes + * \code + * # Column 1: Product + * # Column 2: Price + * # Column 3: Count + * # Column 4: Date + * widget, 12.5, 100, "November 6, 2018" + * \endcode + * + * \par # Example 4: Hash character in double quotes + * \code + * # Column 1: Key + * # Column 2: Value + * # Column 3: Description + * count, 5, "# of widgets currently in stock" + * \endcode + * + * \par Example 5: Extra whitespace + * \code + * # Column 1: Key + * # Column 2: Value + * # Column 3: Description + * count , 5 ,"# of widgets in stock" + * \endcode + * + * \par Example 6: Escaped quotes + * \code + * # Column 1: Key + * # Column 2: Description + * # The value returned for Column 2 will be: String with "embedded" quotes + * foo, "String with ""embedded"" quotes" + * \endcode + */ +// *NS_CHECK_STYLE_ON* +class CsvReader +{ +public: + /** + * Constructor + * + * Opens the file specified in the filepath argument and + * reads data from it. + * + * \param filepath Path to a file containing CSV data. + * \param delimiter Character used to separate fields in the data file. + */ + CsvReader (const std::string& filepath, char delimiter = ','); + + /** + * Constructor + * + * Reads csv data from the supplied input stream. + * + * \param stream Input stream containing csv data. + * \param delimiter Character used to separate fields in the data stream. + */ + CsvReader (std::istream& stream, char delimiter = ','); + + /** + * Destructor + */ + virtual ~CsvReader (); + + /** + * Returns the number of columns in the csv data. + * + * \return Number of columns + */ + std::size_t ColumnCount () const; + + /** + * The number of lines that have been read. + * + * \return The number of lines that have been read. + */ + std::size_t RowNumber () const; + + /** + * Returns the delimiter character specified during object construction. + * + * \return Character used as the column separator. + */ + char Delimiter () const; + + /** + * Reads one line from the input until a new line is encountered. + * The read data is stored in a cache which is accessed by the + * GetValue functions to extract fields from the data. + * + * \return \c true if a line was read successfully or \c false if the + * read failed or reached the end of the file. + */ + bool FetchNextRow (); + + /** + * Attempt to convert from the string data in the specified column + * to the specified data type. + * + * \tparam T The data type of the output variable. + * + * \param [in] columnIndex Index of the column to fetch. + * \param [out] value Location where the converted data will be stored. + * + * \return \c true if the specified column has data and the data + * was converted to the specified data type. + */ + template + bool GetValue (std::size_t columnIndex, T& value) const; + + /** + * Check if the current row is blank. + * A blank row can consist of any combination of + * + * - Whitespace + * - Comment + * - Quoted empty string `""` + * + * \returns \c true if the input row is a blank line. + */ + bool IsBlankRow () const; + +private: + /** + * Attempt to convert from the string data stored at the specified column + * index into the specified type. + * + * \param input [in] String value to be converted. + * \param value [out] Location where the converted value will be stored. + * + * \return \c true if the column exists and the conversion succeeded, + * \c false otherwise. + */ + /** @{ */ + bool GetValueAs (std::string input, double& value) const; + + bool GetValueAs (std::string input, float& value) const; + + bool GetValueAs (std::string input, signed char& value) const; + + bool GetValueAs (std::string input, short& value) const; + + bool GetValueAs (std::string input, int& value) const; + + bool GetValueAs (std::string input, long& value) const; + + bool GetValueAs (std::string input, long long& value) const; + + bool GetValueAs (std::string input, std::string& value) const; + + bool GetValueAs (std::string input, unsigned char& value) const; + + bool GetValueAs (std::string input, unsigned short& value) const; + + bool GetValueAs (std::string input, unsigned int& value) const; + + bool GetValueAs (std::string input, unsigned long& value) const; + + bool GetValueAs (std::string input, unsigned long long& value) const; + /** @} */ + + /** + * Returns \c true if the supplied character matches the delimiter. + * + * \param c Character to check. + * \return \c true if \pname{c} is the delimiter character, + * \c false otherwise. + */ + bool IsDelimiter (char c) const; + + /** + * Scans the string and splits it into individual columns based on the delimiter. + * + * \param [in] line String containing delimiter separated data. + */ + void ParseLine (const std::string& line); + + /** + * Extracts the data for one column in a csv row. + * + * \param begin Iterator to the first character in the row. + * \param end Iterator to the last character in the row. + * \return A tuple containing the content of the column and an iterator + * pointing to the position in the row where the column ended. + */ + std::tuple + ParseColumn (std::string::const_iterator begin, std::string::const_iterator end); + + /** + * Container of CSV data. Each entry represents one field in a row + * of data. The fields are stored in the same order that they are + * encountered in the CSV data. + */ + typedef std::vector Columns; + + char m_delimiter; //!< Character used to separate fields. + std::size_t m_rowsRead; //!< Number of lines processed. + Columns m_columns; //!< Fields extracted from the current line. + bool m_blankRow; //!< Line contains no data (blank line or comment only). + std::ifstream m_fileStream; //!< File stream containing the data. + + /** + * Pointer to the input stream containing the data. + */ + std::istream* m_stream; + +}; // class CsvReader + + +/**************************************************** + * Template implementations. + ***************************************************/ + +template +bool +CsvReader::GetValue (std::size_t columnIndex, T& value) const +{ + if ( columnIndex >= ColumnCount () ) + { + return false; + } + + std::string cell = m_columns[columnIndex]; + + return GetValueAs (std::move (cell), value); +} + +} // namespace ns3 + +#endif // NS3_CSV_READER_H_ diff --git a/src/core/wscript b/src/core/wscript index df84c0f01..500f1d2e7 100644 --- a/src/core/wscript +++ b/src/core/wscript @@ -235,6 +235,7 @@ def build(bld): 'model/time-printer.cc', 'model/show-progress.cc', 'model/system-wall-clock-timestamp.cc', + 'helper/csv-reader.cc', ] if (bld.env['ENABLE_EXAMPLES']): @@ -368,6 +369,7 @@ def build(bld): 'model/node-printer.h', 'model/time-printer.h', 'model/show-progress.h', + 'helper/csv-reader.h', ] if (bld.env['ENABLE_EXAMPLES']):