Create custom string tokenizer for use in placefile parsing

- Avoids the use of regular expressions, and is expected to be more efficient with large placefiles
This commit is contained in:
Dan Paulat 2023-07-16 23:59:28 -05:00
parent 88475f5b0e
commit 6767c0c50a
4 changed files with 148 additions and 0 deletions

View file

@ -0,0 +1,62 @@
#include <scwx/util/strings.hpp>
#include <gtest/gtest.h>
namespace scwx
{
namespace util
{
// A "Color:" statement is split once on the colon and then on each following
// space; whatever follows the last delimiter is kept as one final token.
TEST(StringsTest, ParseTokensColor)
{
   const std::string              input {"Color: red green blue alpha discarded"};
   const std::vector<std::string> separators {":", " ", " ", " ", " "};
   const std::vector<std::string> expected {
      "Color", "red", "green", "blue", "alpha", "discarded"};

   const std::vector<std::string> actual = ParseTokens(input, separators);

   // Abort early on a size mismatch so the element checks never index out of
   // range
   ASSERT_EQ(actual.size(), expected.size());
   for (std::size_t i = 0; i < expected.size(); ++i)
   {
      EXPECT_EQ(actual[i], expected[i]);
   }
}
// Starting the search just past the "Color:" prefix must yield only the four
// color components.
TEST(StringsTest, ParseTokensColorOffset)
{
   const std::string              input {"Color: red green blue alpha"};
   const std::vector<std::string> separators {" ", " ", " ", " "};
   const std::vector<std::string> expected {"red", "green", "blue", "alpha"};

   // Begin tokenizing immediately after the statement keyword
   const std::size_t start = std::string {"Color:"}.size();

   const std::vector<std::string> actual =
      ParseTokens(input, separators, start);

   // Abort early on a size mismatch so the element checks never index out of
   // range
   ASSERT_EQ(actual.size(), expected.size());
   for (std::size_t i = 0; i < expected.size(); ++i)
   {
      EXPECT_EQ(actual[i], expected[i]);
   }
}
// A "Text:" statement contains quoted fields with embedded commas; those commas
// must not split the field, and the quotation marks stay part of the token.
TEST(StringsTest, ParseTokensText)
{
   const std::string input {
      "Text: lat, lon, fontNumber, \"string, string\", \"hover, hover\", "
      "discarded"};
   const std::vector<std::string> separators {":", ",", ",", ",", ",", ","};
   const std::vector<std::string> expected {"Text",
                                            "lat",
                                            "lon",
                                            "fontNumber",
                                            "\"string, string\"",
                                            "\"hover, hover\"",
                                            "discarded"};

   const std::vector<std::string> actual = ParseTokens(input, separators);

   // Abort early on a size mismatch so the element checks never index out of
   // range
   ASSERT_EQ(actual.size(), expected.size());
   for (std::size_t i = 0; i < expected.size(); ++i)
   {
      EXPECT_EQ(actual[i], expected[i]);
   }
}
} // namespace util
} // namespace scwx

View file

@ -31,6 +31,7 @@ set(SRC_QT_UTIL_TESTS source/scwx/qt/util/q_file_input_stream.test.cpp)
# Unit test sources under source/scwx/util
set(SRC_UTIL_TESTS source/scwx/util/float.test.cpp
source/scwx/util/rangebuf.test.cpp
source/scwx/util/streams.test.cpp
source/scwx/util/strings.test.cpp
source/scwx/util/vectorbuf.test.cpp)
set(SRC_WSR88D_TESTS source/scwx/wsr88d/ar2v_file.test.cpp
source/scwx/wsr88d/level3_file.test.cpp

View file

@ -8,6 +8,25 @@ namespace scwx
namespace util
{
/**
 * @brief Parse a list of tokens from a string
 *
 * This function will take an input string, and apply the delimiters vector in
 * order to tokenize the string. Each set of delimiters in the delimiters vector
 * will be used once. A set of delimiters will be used to match any character,
 * rather than a sequence of characters. Tokens are automatically trimmed of any
 * whitespace.
 *
 * If a token begins with a double quotation mark, the delimiter search for
 * that token starts after the closing quotation mark, so delimiter characters
 * inside the quotes do not split the token. The quotation marks remain part of
 * the returned token.
 *
 * Tokenizing stops when every delimiter set has been used, or as soon as a
 * delimiter set is not found. Any text remaining at that point is returned as
 * one final token.
 *
 * @param [in] s Input string to tokenize
 * @param [in] delimiters A vector of delimiters to use for each token. Element
 * i holds the set of characters that may terminate token i.
 * @param [in] pos Search begin position. Default is 0.
 *
 * @return Tokenized string, in the order the tokens appear in the input
 */
std::vector<std::string> ParseTokens(const std::string& s,
std::vector<std::string> delimiters,
std::size_t pos = 0);
std::string ToString(const std::vector<std::string>& v);
} // namespace util

View file

@ -1,10 +1,76 @@
#include <scwx/util/strings.hpp>
#include <cctype>
#include <boost/algorithm/string/trim.hpp>
namespace scwx
{
namespace util
{
/**
 * @brief Test whether a character is whitespace
 *
 * std::isspace requires its argument to be representable as unsigned char (or
 * EOF). A plain char holding a byte >= 0x80 (e.g. part of a UTF-8 sequence) may
 * be negative, so it is converted before classification.
 *
 * @param [in] c Character to classify
 *
 * @return true if the character is whitespace
 */
static bool IsTokenSpace(char c)
{
   return std::isspace(static_cast<unsigned char>(c)) != 0;
}

/**
 * @brief Copy the range [first, last) of a string, without surrounding
 * whitespace
 *
 * The trimmed bounds are computed before copying, so the token is allocated
 * once at its final size.
 *
 * @param [in] s Source string
 * @param [in] first Index of the first character of the range
 * @param [in] last Index one past the last character of the range. Must not
 * exceed s.size(), and must not be less than first.
 *
 * @return Trimmed copy of the range
 */
static std::string
TrimmedSubstring(const std::string& s, std::size_t first, std::size_t last)
{
   while (first < last && IsTokenSpace(s[first]))
   {
      ++first;
   }
   while (last > first && IsTokenSpace(s[last - 1]))
   {
      --last;
   }
   return s.substr(first, last - first);
}

/**
 * @brief Parse a list of tokens from a string
 *
 * Each entry of the delimiters vector is used once, in order, as a set of
 * characters that may terminate the next token. A token that begins with a
 * double quotation mark is not searched for delimiters until after its closing
 * quotation mark; the quotation marks remain part of the token. Tokens are
 * trimmed of surrounding whitespace.
 *
 * Tokenizing stops when all delimiter sets have been used, or as soon as one
 * is not found. Any text remaining at that point is appended as a final token.
 *
 * @param [in] s Input string to tokenize
 * @param [in] delimiters A vector of delimiters to use for each token
 * @param [in] pos Search begin position
 *
 * @return Tokenized string
 */
std::vector<std::string> ParseTokens(const std::string&       s,
                                     std::vector<std::string> delimiters,
                                     std::size_t              pos)
{
   std::vector<std::string> tokens {};

   // At most one token per delimiter, plus the trailing remainder
   tokens.reserve(delimiters.size() + 1);

   // Iterate through each delimiter
   for (std::size_t i = 0; i < delimiters.size() && pos != std::string::npos;
        ++i)
   {
      // Skip leading spaces
      while (pos < s.size() && IsTokenSpace(s[pos]))
      {
         ++pos;
      }

      // By default, search for the delimiter starting at the current position
      std::size_t findPos = pos;

      if (pos < s.size() && s[pos] == '"')
      {
         // Do not search for a delimiter within a quoted string. If the quote
         // is never closed, findPos becomes npos and the search below fails.
         findPos = s.find('"', pos + 1);

         // Begin the search one after the closing quotation mark
         if (findPos != std::string::npos)
         {
            ++findPos;
         }
      }

      // Search for any character in the current delimiter set
      const std::size_t nextPos = s.find_first_of(delimiters[i], findPos);

      // If the delimiter was not found, stop processing tokens. The text from
      // pos onward is picked up as the remainder below.
      if (nextPos == std::string::npos)
      {
         break;
      }

      // Add the current substring as a token
      tokens.push_back(TrimmedSubstring(s, pos, nextPos));

      // Continue after the delimiter, at the next non-space character
      pos = nextPos + 1;
      while (pos < s.size() && IsTokenSpace(s[pos]))
      {
         ++pos;
      }
   }

   // Add the remainder of the string as a token
   if (pos < s.size())
   {
      tokens.push_back(TrimmedSubstring(s, pos, s.size()));
   }

   return tokens;
}
std::string ToString(const std::vector<std::string>& v)
{
std::string value {};