core: (merges !442) Add CsvReader for parsing of csv- or tab-delimited data

This commit is contained in:
Peter D. Barnes, Jr
2020-10-15 15:56:05 -07:00
committed by Tom Henderson
parent a43d6daf87
commit ed5a19f227
3 changed files with 911 additions and 0 deletions

View File

@@ -0,0 +1,480 @@
/* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
/*
* Copyright (c) 2019 Lawrence Livermore National Laboratory
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation;
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Mathew Bielejeski <bielejeski1@llnl.gov>
*/
#include "csv-reader.h"
#include "ns3/log.h"
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iterator>
#include <limits>
#include <sstream>
#include <tuple>
#include <utility>
#include <vector>
/**
* \file
* \ingroup core
* \ingroup csvreader
*
* ns3::CsvReader implementation
*/
NS_LOG_COMPONENT_DEFINE ("CsvReader");
namespace {
/**
* Convert a string into another type.
*
* Uses a stringstream to deserialize the value stored in \p input
* to a value of type T and writes the deserialized value to \p output.
*
* \tparam T Data type of output.
* \param input String containing serialized data.
* \param output Place to store deserialized value.
*
* \return \c true if deserialization was successful, \c false otherwise.
*/
template<typename T>
bool GenericTransform (std::string input, T& output)
{
  NS_LOG_FUNCTION (input);

  // Let the stream extraction operator for T do the parsing; the
  // conversion counts as successful when the stream did not fail.
  std::istringstream parser (input);
  parser >> output;

  const bool converted = !parser.fail ();
  return converted;
}
} // unnamed namespace
namespace ns3 {
// Construct a reader which opens \p filepath itself.
//
// The file is opened through the owned m_fileStream member and m_stream
// is pointed at it.  The result of the open is not checked here; a stream
// left in a failed state makes FetchNextRow() return false.
//
// m_blankRow is given a defined initial value so that IsBlankRow() is
// well defined even before the first row has been fetched.
CsvReader::CsvReader (const std::string& filepath, char delimiter /* =',' */)
  : m_delimiter (delimiter),
    m_rowsRead (0),
    m_blankRow (false),
    m_fileStream (filepath),
    m_stream (&m_fileStream)
{
  NS_LOG_FUNCTION (this << filepath);
}
// Construct a reader on top of a caller-supplied stream.
//
// Only the address of \p stream is stored, so the stream must outlive
// this reader.  The owned m_fileStream is default constructed and unused.
//
// m_blankRow is given a defined initial value so that IsBlankRow() is
// well defined even before the first row has been fetched.
CsvReader::CsvReader (std::istream& stream, char delimiter /* =',' */)
  : m_delimiter (delimiter),
    m_rowsRead (0),
    m_blankRow (false),
    m_fileStream (),
    m_stream (&stream)
{
  NS_LOG_FUNCTION (this);
}
// Nothing to release by hand: every member cleans up after itself.
CsvReader::~CsvReader () = default;
// Number of fields held for the most recently parsed row.
std::size_t
CsvReader::ColumnCount () const
{
  NS_LOG_FUNCTION (this);

  const std::size_t count = m_columns.size ();
  return count;
}
// Number of lines successfully read so far by FetchNextRow().
std::size_t
CsvReader::RowNumber () const
{
  NS_LOG_FUNCTION (this);

  const std::size_t rows = m_rowsRead;
  return rows;
}
// Field separator chosen at construction time.
char
CsvReader::Delimiter () const
{
  NS_LOG_FUNCTION (this);

  const char separator = m_delimiter;
  return separator;
}
// Read the next line from the input and split it into columns.
//
// Returns true when a line was read and parsed, false when the stream
// is exhausted or the read failed.  The row counter is only advanced
// for lines that were actually read.
bool
CsvReader::FetchNextRow ()
{
  NS_LOG_FUNCTION (this);

  if (m_stream->eof ())
    {
      NS_LOG_LOGIC ("Reached end of stream");
      return false;
    }

  NS_LOG_LOGIC ("Reading line " << m_rowsRead + 1);

  std::string line;
  if (!std::getline (*m_stream, line))
    {
      // When the input ends with a newline the end of the stream is only
      // discovered by this read: getline() extracts nothing and sets both
      // eofbit and failbit.  That is a normal end of input, not an error,
      // so only report an error for genuine read failures.
      if (m_stream->eof () && !m_stream->bad ())
        {
          NS_LOG_LOGIC ("Reached end of stream");
        }
      else
        {
          NS_LOG_ERROR ("Reading line " << m_rowsRead + 1 << " failed");
        }
      return false;
    }

  ++m_rowsRead;
  ParseLine (line);
  return true;
}
// Report the blank-row flag computed by ParseLine() for the current row.
bool
CsvReader::IsBlankRow () const
{
  const bool blank = m_blankRow;
  return blank;
}
// Convert a field to double using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, double& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to float using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, float& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to a signed 8-bit number.
//
// Extracting directly into a signed char would read a single character
// rather than a number, so the text is parsed into a wider integer first
// and then range checked.  The conversion succeeds only when the text
// parsed as a number AND that number fits in a signed char; \p value is
// left untouched otherwise.
bool
CsvReader::GetValueAs (std::string input, signed char& value) const
{
  typedef signed char byte_type;

  NS_LOG_FUNCTION (this << input);

  std::istringstream tempStream (input);

  std::int16_t tempOutput = 0;
  tempStream >> tempOutput;

  // Both bounds must hold; the value must lie inside [min, max].
  const bool inRange = tempOutput >= std::numeric_limits<byte_type>::min ()
    && tempOutput <= std::numeric_limits<byte_type>::max ();

  bool success = static_cast<bool> (tempStream) && inRange;

  if (success)
    {
      value = static_cast<byte_type> (tempOutput);
    }

  NS_LOG_DEBUG ("Input='" << input
                          << "', output=" << tempOutput
                          << ", result=" << success);

  return success;
}
// Convert a field to short using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, short& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to int using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, int& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to long using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, long& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to long long using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, long long& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Hand a field back as a string; no conversion is needed, so this
// overload always succeeds.
bool
CsvReader::GetValueAs (std::string input, std::string& value) const
{
  NS_LOG_FUNCTION (this << input);

  // input is our own by-value copy and is not used after this point, so
  // move it into the output instead of copying it a second time.
  value = std::move (input);
  return true;
}
// Convert a field to an unsigned 8-bit number.
//
// Extracting directly into an unsigned char would read a single character
// rather than a number, so the text is parsed into a wider integer first
// and then range checked.  A signed intermediate is used on purpose:
// extracting negative text into an unsigned type can wrap around to a
// positive value, which would let some negative inputs slip through the
// range check.  The conversion succeeds only when the text parsed as a
// number AND that number fits in an unsigned char; \p value is left
// untouched otherwise.
bool
CsvReader::GetValueAs (std::string input, unsigned char& value) const
{
  typedef unsigned char byte_type;

  NS_LOG_FUNCTION (this << input);

  std::istringstream tempStream (input);

  std::int32_t tempOutput = 0;
  tempStream >> tempOutput;

  // Both bounds must hold; the value must lie inside [min, max].
  const bool inRange = tempOutput >= std::numeric_limits<byte_type>::min ()
    && tempOutput <= std::numeric_limits<byte_type>::max ();

  bool success = static_cast<bool> (tempStream) && inRange;

  if (success)
    {
      value = static_cast<byte_type> (tempOutput);
    }

  NS_LOG_DEBUG ("Input='" << input
                          << "', output=" << tempOutput
                          << ", result=" << success);

  return success;
}
// Convert a field to unsigned short using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, unsigned short& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to unsigned int using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, unsigned int& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to unsigned long using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, unsigned long& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Convert a field to unsigned long long using the generic stream-based conversion.
bool
CsvReader::GetValueAs (std::string input, unsigned long long& value) const
{
  NS_LOG_FUNCTION (this << input);

  const bool converted = GenericTransform (std::move (input), value);
  return converted;
}
// Test a character against the configured field separator.
bool
CsvReader::IsDelimiter (char c) const
{
  NS_LOG_FUNCTION (this << c);

  const bool matches = (m_delimiter == c);
  return matches;
}
// Split one line of input into columns and update the blank-row flag.
//
// The previous row's columns are discarded.  ParseColumn() is called
// repeatedly; each call returns one field plus the position where it
// stopped, which is either the delimiter that ended the field or the
// end of the line.
void
CsvReader::ParseLine (const std::string& line)
{
  NS_LOG_FUNCTION (this << line);

  m_columns.clear ();

  auto start_col = line.begin ();

  while (start_col != line.end ())
    {
      std::string value;
      std::string::const_iterator end_col;
      std::tie (value, end_col) = ParseColumn (start_col, line.end ());

      NS_LOG_DEBUG ("ParseColumn() returned: " << value);

      m_columns.push_back (std::move (value));

      // Step over the delimiter which terminated this column, if any.
      // A delimiter in the last position therefore ends the loop without
      // producing an extra empty column.
      if (end_col != line.end ())
        {
          ++end_col;
        }

      start_col = end_col;
    }

  // A row is blank when it carries no data at all: either the line was
  // completely empty (the loop never ran, so there are no columns), or it
  // reduced to a single empty field (whitespace, a comment, or "").
  m_blankRow = m_columns.empty ()
    || ((m_columns.size () == 1) && m_columns.front ().empty ());

  NS_LOG_LOGIC ("blank row: " << m_blankRow);
}
// Extract a single column from the range [begin, end).
//
// A small state machine walks the characters:
//  - leading whitespace is skipped;
//  - a field starting with a double quote is read verbatim up to the
//    closing quote, with a doubled quote ("") standing for one literal
//    quote character;
//  - any other field is read as is and has its trailing whitespace removed;
//  - outside of a quoted string, the delimiter ends the field and a '#'
//    starts a comment which swallows the rest of the range.
//
// Returns the field text and the position where parsing stopped: the
// delimiter which ended the field, or \p end.
std::tuple<std::string, std::string::const_iterator>
CsvReader::ParseColumn (std::string::const_iterator begin, std::string::const_iterator end)
{
  NS_LOG_FUNCTION (this << std::string (begin, end));

  enum class State
  {
    BEGIN,
    END_QUOTE,
    FIND_DELIMITER,
    QUOTED_STRING,
    UNQUOTED_STRING,
    END
  };

  State state = State::BEGIN;
  std::string buffer;
  auto iter = begin;

  // Set once the field turns out to be unquoted.  The only way out of
  // UNQUOTED_STRING is to END, so this tells us after the loop whether
  // trailing whitespace has to be stripped, no matter whether the field
  // was ended by a delimiter, a comment or the end of the input.
  bool unquoted = false;

  while (state != State::END)
    {
      if (iter == end)
        {
          NS_LOG_DEBUG ("Found end iterator, switching to END state");

          state = State::END;
          continue;
        }

      auto c = *iter;

      NS_LOG_DEBUG ("Next character: '" << c << "'");

      // handle common cases here to avoid duplicating logic
      if (state != State::QUOTED_STRING)
        {
          if (IsDelimiter (c))
            {
              NS_LOG_DEBUG ("Found field delimiter, switching to END state");

              // iter is left on the delimiter; the caller steps over it.
              state = State::END;
              continue;
            }
          else if (c == '#')
            {
              NS_LOG_DEBUG ("Found start of comment, switching to END state");

              // comments consume the rest of the line, set iter to end
              // to reflect that fact.
              iter = end;
              state = State::END;
              continue;
            }
        }

      switch (state)
        {
        case State::BEGIN:
          {
            if (c == '"')
              {
                NS_LOG_DEBUG ("Switching state: BEGIN -> QUOTED_STRING");

                state = State::QUOTED_STRING;
              }
            // std::isspace requires a value representable as unsigned char
            else if (!std::isspace (static_cast<unsigned char> (c)))
              {
                NS_LOG_DEBUG ("Switching state: BEGIN -> UNQUOTED_STRING");

                state = State::UNQUOTED_STRING;
                unquoted = true;
                buffer.push_back (c);
              }
          }
          break;

        case State::QUOTED_STRING:
          {
            if (c == '"')
              {
                NS_LOG_DEBUG ("Switching state: QUOTED_STRING -> END_QUOTE");

                state = State::END_QUOTE;
              }
            else
              {
                buffer.push_back (c);
              }
          }
          break;

        case State::END_QUOTE:
          {
            if (c == '"')
              {
                NS_LOG_DEBUG ("Switching state: END_QUOTE -> QUOTED_STRING" );

                // an escape quote instead of an end quote
                state = State::QUOTED_STRING;
                buffer.push_back (c);
              }
            else
              {
                NS_LOG_DEBUG ("Switching state: END_QUOTE -> FIND_DELIMITER" );

                // anything between the closing quote and the delimiter
                // is dropped
                state = State::FIND_DELIMITER;
              }
          }
          break;

        case State::UNQUOTED_STRING:
          {
            buffer.push_back (c);
          }
          break;

        case State::FIND_DELIMITER:
          break;

        case State::END:
          break;
        }

      ++iter;
    }

  if (unquoted)
    {
      NS_LOG_DEBUG ("Removing trailing whitespace from unquoted field: '" << buffer << "'");

      auto len = buffer.size ();

      // remove trailing whitespace from the field
      while (!buffer.empty ()
             && std::isspace (static_cast<unsigned char> (buffer.back ())))
        {
          buffer.pop_back ();
        }

      auto finalLen = buffer.size ();

      NS_LOG_DEBUG ("Removed " << (len - finalLen) << " trailing whitespace characters");
    }

  NS_LOG_DEBUG ("Field value: " << buffer);

  return std::make_tuple (buffer, iter);
}
} // namespace ns3

View File

@@ -0,0 +1,429 @@
/* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
/*
* Copyright (c) 2019 Lawrence Livermore National Laboratory
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation;
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Mathew Bielejeski <bielejeski1@llnl.gov>
*/
#ifndef NS3_CSV_READER_H_
#define NS3_CSV_READER_H_
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <istream>
#include <string>
#include <vector>
/**
* \file
* \ingroup core
* \ingroup csvreader
*
* ns3::CsvReader declaration
*
*/
namespace ns3 {
/**
* \ingroup core
* \defgroup csvreader
*
* A way to extract data from simple csv files.
*/
// *NS_CHECK_STYLE_OFF* Style checker trims blank lines in code blocks
/**
* \ingroup csvreader
*
* Provides functions for parsing and extracting data from
* Comma Separated Value (CSV) formatted text files.
* This parser is somewhat more relaxed than \RFC{4180};
* see below for a list of the differences.
* In particular it is possible to set the delimiting character at construction,
* enabling parsing of tab-delimited streams or other formats with delimiters.
*
* \note Excel may generate "CSV" files with either ',' or ';' delimiter
* depending on the locale: if ',' is the decimal mark then ';' is the list
* separator and used to read/write "CSV" files.
*
* To use this facility, construct a CsvReader from either a file path
* or \c std::istream, then FetchNextRow(), and finally GetValue()
* to extract specific values from the row.
*
* For example:
* \code
* CsvReader csv (filePath);
* while (csv.FetchNextRow ())
* {
* // Ignore blank lines
* if (csv.IsBlankRow ())
* {
* continue;
* }
*
* // Expecting three values
* double x, y, z;
* bool ok = csv.GetValue (0, x);
* ok &= csv.GetValue (1, y);
* ok &= csv.GetValue (2, z);
* if (!ok)
* {
* // Handle error, then
* continue;
* }
*
* // Do something with values
*
* } // while FetchNextRow
* \endcode
*
* As another example, supposing we need a vector from each row,
* the middle of the previous example would become:
* \code
* std::vector<double> v (n);
* bool ok = true;
* for (std::size_t i = 0; i < v.size (); ++i)
* {
* ok &= csv.GetValue (i, v[i]);
* }
* if (!ok) ...
* \endcode
*
*
* File Format
* ===========
*
* This parser implements \RFC{4180}, but with several restrictions removed;
* see below for differences. All the formatting features described next
* are illustrated in the examples which follow.
*
* Comments
* --------
*
* The hash character (#) is used to indicate the start of a comment. Comments
* are not parsed by the reader. Comments are treated as either an empty column
* or part of an existing column depending on where the comment is located.
* Comments that are found at the end of a line containing data are ignored.
*
* 1,2 # This comment ignored, leaving two data columns
*
* Lines that contain a comment and no data are treated as rows with a single
* empty column, meaning that ColumnCount will return 1 and
* GetValue() will return an empty string.
*
* # This row treated as a single empty column, returning an empty string.
* "" # So is this
*
* IsBlankRow() will return \c true in either of these cases.
*
* Quoted Columns
* --------------
*
* Columns with string data which contain the delimiter character or
* the hash character can be wrapped in double quotes to prevent CsvReader
* from treating them as special characters.
*
* 3,string without delimiter,"String with comma ',' delimiter"
*
* Double quotes can be escaped
* by doubling up the quotes inside a quoted field. See example 6 below for
* a demonstration.
*
* Whitespace
* ----------
*
* Leading and trailing whitespace are ignored by the reader and are not
* stored in the column data.
*
* 4,5 , 6 # Columns contain '4', '5', '6'
*
* If leading or trailing whitespace are important
* for a column, wrap the column in double quotes as discussed above.
*
* 7,"8 "," 9" # Columns contain '7', '8 ', ' 9'
*
* Trailing Delimiter
* ------------------
*
* Trailing delimiters are ignored; they do _not_ result in an empty column.
*
*
* Differences from RFC 4180
* -------------------------
* Section 2.1
* - Line break can be LF or CRLF
*
* Section 2.3
* - Non-parsed lines are allowed anywhere, not just as a header.
* - Lines do not all have to contain the same number of fields.
*
* Section 2.4
* - Characters other than comma can be used to separate fields.
* - Lines do not all have to contain the same number of fields.
* - Leading/trailing spaces are stripped from the field
* unless the whitespace is wrapped in double quotes.
* - A trailing delimiter on a line is not an error.
*
* Section 2.6
* - Quoted fields cannot contain line breaks
*
* Examples
* --------
* \par Example 1: Basic
* \code
* # Column 1: Product
* # Column 2: Price
* widget, 12.5
* \endcode
*
* \par Example 2: Comment at end of line
* \code
* # Column 1: Product
* # Column 2: Price
* broken widget, 12.5 # this widget is broken
* \endcode
*
* \par Example 3: Delimiter in double quotes
* \code
* # Column 1: Product
* # Column 2: Price
* # Column 3: Count
* # Column 4: Date
* widget, 12.5, 100, "November 6, 2018"
* \endcode
*
* \par Example 4: Hash character in double quotes
* \code
* # Column 1: Key
* # Column 2: Value
* # Column 3: Description
* count, 5, "# of widgets currently in stock"
* \endcode
*
* \par Example 5: Extra whitespace
* \code
* # Column 1: Key
* # Column 2: Value
* # Column 3: Description
* count , 5 ,"# of widgets in stock"
* \endcode
*
* \par Example 6: Escaped quotes
* \code
* # Column 1: Key
* # Column 2: Description
* # The value returned for Column 2 will be: String with "embedded" quotes
* foo, "String with ""embedded"" quotes"
* \endcode
*/
// *NS_CHECK_STYLE_ON*
class CsvReader
{
public:
/**
* Constructor
*
* Opens the file specified in the filepath argument and
* reads data from it.
*
* \param filepath Path to a file containing CSV data.
* \param delimiter Character used to separate fields in the data file.
*/
CsvReader (const std::string& filepath, char delimiter = ',');
/**
* Constructor
*
* Reads csv data from the supplied input stream.
* Only a pointer to the stream is kept, so the stream has to outlive
* this reader.
*
* \param stream Input stream containing csv data.
* \param delimiter Character used to separate fields in the data stream.
*/
CsvReader (std::istream& stream, char delimiter = ',');
/**
* Destructor
*/
virtual ~CsvReader ();
/**
* Returns the number of columns in the most recently fetched row.
*
* \return Number of columns
*/
std::size_t ColumnCount () const;
/**
* The number of lines that have been read.
* Every line read successfully by FetchNextRow() is counted.
*
* \return The number of lines that have been read.
*/
std::size_t RowNumber () const;
/**
* Returns the delimiter character specified during object construction.
*
* \return Character used as the column separator.
*/
char Delimiter () const;
/**
* Reads one line from the input until a new line is encountered.
* The read data is stored in a cache which is accessed by the
* GetValue functions to extract fields from the data.
* The cache is replaced by each line which is read successfully.
*
* \return \c true if a line was read successfully or \c false if the
* read failed or reached the end of the file.
*/
bool FetchNextRow ();
/**
* Attempt to convert from the string data in the specified column
* to the specified data type.
*
* \tparam T The data type of the output variable.
*
* \param [in] columnIndex Index of the column to fetch.
* \param [out] value Location where the converted data will be stored.
*
* \return \c true if the specified column has data and the data
* was converted to the specified data type; \c false if
* \p columnIndex is not less than ColumnCount() or the
* conversion failed.
*/
template<class T>
bool GetValue (std::size_t columnIndex, T& value) const;
/**
* Check if the current row is blank.
* A blank row can consist of any combination of
*
* - Whitespace
* - Comment
* - Quoted empty string `""`
*
* \returns \c true if the input row is a blank line.
*/
bool IsBlankRow () const;
private:
/**
* Attempt to convert the supplied string into the specified type.
* There is one overload for each supported output type.
*
* \param input [in] String value to be converted.
* \param value [out] Location where the converted value will be stored.
*
* \return \c true if the conversion succeeded,
* \c false otherwise.
*/
/** @{ */
bool GetValueAs (std::string input, double& value) const;
bool GetValueAs (std::string input, float& value) const;
bool GetValueAs (std::string input, signed char& value) const;
bool GetValueAs (std::string input, short& value) const;
bool GetValueAs (std::string input, int& value) const;
bool GetValueAs (std::string input, long& value) const;
bool GetValueAs (std::string input, long long& value) const;
bool GetValueAs (std::string input, std::string& value) const;
bool GetValueAs (std::string input, unsigned char& value) const;
bool GetValueAs (std::string input, unsigned short& value) const;
bool GetValueAs (std::string input, unsigned int& value) const;
bool GetValueAs (std::string input, unsigned long& value) const;
bool GetValueAs (std::string input, unsigned long long& value) const;
/** @} */
/**
* Returns \c true if the supplied character matches the delimiter.
*
* \param c Character to check.
* \return \c true if \pname{c} is the delimiter character,
* \c false otherwise.
*/
bool IsDelimiter (char c) const;
/**
* Scans the string and splits it into individual columns based on the delimiter.
* The columns found replace the cached columns of the previous row.
*
* \param [in] line String containing delimiter separated data.
*/
void ParseLine (const std::string& line);
/**
* Extracts the data for one column in a csv row.
*
* \param begin Iterator to the first character of the column to extract.
* \param end Iterator one past the last character in the row.
* \return A tuple containing the content of the column and an iterator
* pointing to the position in the row where the column ended.
*/
std::tuple<std::string, std::string::const_iterator>
ParseColumn (std::string::const_iterator begin, std::string::const_iterator end);
/**
* Container of CSV data. Each entry represents one field in a row
* of data. The fields are stored in the same order that they are
* encountered in the CSV data.
*/
typedef std::vector<std::string> Columns;
char m_delimiter; //!< Character used to separate fields.
std::size_t m_rowsRead; //!< Number of lines processed.
Columns m_columns; //!< Fields extracted from the current line.
bool m_blankRow; //!< Line contains no data (blank line or comment only).
std::ifstream m_fileStream; //!< File stream containing the data; only opened by the file path constructor.
/**
* Pointer to the input stream containing the data.
* Points either at m_fileStream or at the stream supplied
* to the constructor.
*/
std::istream* m_stream;
}; // class CsvReader
/****************************************************
* Template implementations.
***************************************************/
template<class T>
bool
CsvReader::GetValue (std::size_t columnIndex, T& value) const
{
  // Reject indices beyond the columns of the current row.
  const bool inRange = columnIndex < ColumnCount ();
  if (!inRange)
    {
      return false;
    }

  // GetValueAs takes its input by value, so the cached column is copied
  // and left untouched; overload resolution on T selects the conversion.
  return GetValueAs (m_columns[columnIndex], value);
}
} // namespace ns3
#endif // NS3_CSV_READER_H_

View File

@@ -235,6 +235,7 @@ def build(bld):
'model/time-printer.cc',
'model/show-progress.cc',
'model/system-wall-clock-timestamp.cc',
'helper/csv-reader.cc',
]
if (bld.env['ENABLE_EXAMPLES']):
@@ -368,6 +369,7 @@ def build(bld):
'model/node-printer.h',
'model/time-printer.h',
'model/show-progress.h',
'helper/csv-reader.h',
]
if (bld.env['ENABLE_EXAMPLES']):