Create custom string tokenizer for use in placefile parsing

- Avoids the use of regular expressions, and is expected to be more efficient with large placefiles
2025-10-30 08:00:06 +00:00 · 2023-07-16 23:59:28 -05:00 · 2023-07-16 23:59:28 -05:00 · 6767c0c50a
commit 6767c0c50a
parent 88475f5b0e
4 changed files with 148 additions and 0 deletions
--- a/test/source/scwx/util/strings.test.cpp
+++ b/test/source/scwx/util/strings.test.cpp
@ -0,0 +1,62 @@
+#include <scwx/util/strings.hpp>
+
+#include <gtest/gtest.h>
+
+namespace scwx
+{
+namespace util
+{
+
+// Tokenizes a space-separated "Color:" statement from the start of the line.
+// The key is split on ':' and each component on ' '. Text following the last
+// delimiter is expected back as one additional, final token.
+TEST(StringsTest, ParseTokensColor)
+{
+   static const std::string line {"Color: red green blue alpha discarded"};
+   static const std::vector<std::string> delimiters {":", " ", " ", " ", " "};
+
+   std::vector<std::string> tokens = ParseTokens(line, delimiters);
+
+   // Five delimiter sets plus the remainder. The unsigned literal avoids a
+   // signed/unsigned comparison against size().
+   ASSERT_EQ(tokens.size(), 6u);
+   EXPECT_EQ(tokens[0], "Color");
+   EXPECT_EQ(tokens[1], "red");
+   EXPECT_EQ(tokens[2], "green");
+   EXPECT_EQ(tokens[3], "blue");
+   EXPECT_EQ(tokens[4], "alpha");
+   EXPECT_EQ(tokens[5], "discarded");
+}
+
+// Tokenizes the same kind of statement, but starts the search after the
+// "Color:" key by passing its length as the begin position, so only the
+// component values are returned.
+TEST(StringsTest, ParseTokensColorOffset)
+{
+   static const std::string              line {"Color: red green blue alpha"};
+   static const std::vector<std::string> delimiters {" ", " ", " ", " "};
+   static const std::size_t              offset = std::string {"Color:"}.size();
+
+   std::vector<std::string> tokens = ParseTokens(line, delimiters, offset);
+
+   // Only three spaces follow the offset, so the fourth delimiter set finds no
+   // match and "alpha" is returned as the remainder. The unsigned literal
+   // avoids a signed/unsigned comparison against size().
+   ASSERT_EQ(tokens.size(), 4u);
+   EXPECT_EQ(tokens[0], "red");
+   EXPECT_EQ(tokens[1], "green");
+   EXPECT_EQ(tokens[2], "blue");
+   EXPECT_EQ(tokens[3], "alpha");
+}
+
+// Tokenizes a comma-separated "Text:" statement containing quoted strings.
+// Commas inside the quotation marks must not split a token, and the quotation
+// marks themselves are expected to remain part of the token.
+TEST(StringsTest, ParseTokensText)
+{
+   static const std::string line {
+      "Text: lat, lon, fontNumber, \"string, string\", \"hover, hover\", "
+      "discarded"};
+   static const std::vector<std::string> delimiters {
+      ":", ",", ",", ",", ",", ","};
+
+   std::vector<std::string> tokens = ParseTokens(line, delimiters);
+
+   // Six delimiter sets plus the remainder. The unsigned literal avoids a
+   // signed/unsigned comparison against size().
+   ASSERT_EQ(tokens.size(), 7u);
+   EXPECT_EQ(tokens[0], "Text");
+   EXPECT_EQ(tokens[1], "lat");
+   EXPECT_EQ(tokens[2], "lon");
+   EXPECT_EQ(tokens[3], "fontNumber");
+   EXPECT_EQ(tokens[4], "\"string, string\"");
+   EXPECT_EQ(tokens[5], "\"hover, hover\"");
+   EXPECT_EQ(tokens[6], "discarded");
+}
+
+} // namespace util
+} // namespace scwx
--- a/test/test.cmake
+++ b/test/test.cmake
@ -31,6 +31,7 @@ set(SRC_QT_UTIL_TESTS source/scwx/qt/util/q_file_input_stream.test.cpp)
 set(SRC_UTIL_TESTS source/scwx/util/float.test.cpp
                   source/scwx/util/rangebuf.test.cpp
                   source/scwx/util/streams.test.cpp
+                   source/scwx/util/strings.test.cpp
                   source/scwx/util/vectorbuf.test.cpp)
 set(SRC_WSR88D_TESTS source/scwx/wsr88d/ar2v_file.test.cpp
                     source/scwx/wsr88d/level3_file.test.cpp
--- a/wxdata/include/scwx/util/strings.hpp
+++ b/wxdata/include/scwx/util/strings.hpp
@ -8,6 +8,25 @@ namespace scwx
 namespace util
 {

+/**
+ * @brief Parse a list of tokens from a string
+ *
+ * This function will take an input string, and apply the delimiters vector in
+ * order to tokenize the string. Each set of delimiters in the delimiters vector
+ * will be used once. A set of delimiters will be used to match any character,
+ * rather than a sequence of characters. Tokens are automatically trimmed of any
+ * whitespace.
+ *
+ * If a token begins with a double quotation mark, the delimiter search starts
+ * after the closing quotation mark, so delimiter characters inside the quoted
+ * string do not split the token. The quotation marks are kept in the token.
+ *
+ * Tokenizing stops at the first set of delimiters that has no match. Any text
+ * remaining after the last matched delimiter is returned as one final token,
+ * so the result contains at most delimiters.size() + 1 tokens.
+ *
+ * @param [in] s Input string to tokenize
+ * @param [in] delimiters A vector of delimiters to use for each token. Each
+ * element is a set of candidate delimiter characters.
+ * @param [in] pos Search begin position. Default is 0.
+ *
+ * @return Tokenized string, in the order the tokens appear in the input
+ */
+std::vector<std::string> ParseTokens(const std::string&       s,
+                                     std::vector<std::string> delimiters,
+                                     std::size_t              pos = 0);
+
 std::string ToString(const std::vector<std::string>& v);

 } // namespace util
--- a/wxdata/source/scwx/util/strings.cpp
+++ b/wxdata/source/scwx/util/strings.cpp
@ -1,10 +1,76 @@
 #include <scwx/util/strings.hpp>

+#include <cctype>
+
+#include <boost/algorithm/string/trim.hpp>
+
 namespace scwx
 {
 namespace util
 {

+/**
+ * Splits s into tokens, starting at pos and consuming one entry of delimiters
+ * per token. Each entry is a set of characters; the first occurrence of any
+ * of them ends the current token. A token that starts with a double quotation
+ * mark is searched for its delimiter only after the closing quotation mark.
+ * Processing stops at the first entry with no match, and whatever text is
+ * left is appended as a final token. Every token is trimmed of whitespace.
+ *
+ * NOTE(review): delimiters is taken by value to match the declaration in
+ * strings.hpp; changing it to a const reference requires updating both.
+ */
+std::vector<std::string> ParseTokens(const std::string&       s,
+                                     std::vector<std::string> delimiters,
+                                     std::size_t              pos)
+{
+   // std::isspace has undefined behavior for values not representable as
+   // unsigned char (other than EOF), so never pass a plain char directly
+   const auto isSpace = [](char c)
+   { return std::isspace(static_cast<unsigned char>(c)) != 0; };
+
+   std::vector<std::string> tokens {};
+   tokens.reserve(delimiters.size() + 1);
+
+   // Each set of delimiters is applied exactly once, in order
+   for (const std::string& delimiterSet : delimiters)
+   {
+      // Skip leading spaces
+      while (pos < s.size() && isSpace(s[pos]))
+      {
+         ++pos;
+      }
+
+      // Position at which the delimiter search begins
+      std::size_t findPos = pos;
+
+      if (pos < s.size() && s[pos] == '"')
+      {
+         // Do not search for a delimiter within a quoted string
+         findPos = s.find('"', pos + 1);
+
+         // Begin the search one past the closing quotation mark. Without a
+         // closing quotation mark findPos stays npos and the search fails.
+         if (findPos != std::string::npos)
+         {
+            ++findPos;
+         }
+      }
+
+      // Search for any character of the current delimiter set
+      std::size_t nextPos = s.find_first_of(delimiterSet, findPos);
+
+      // If the delimiter was not found, stop processing tokens
+      if (nextPos == std::string::npos)
+      {
+         break;
+      }
+
+      // Add the current substring as a token
+      auto& newToken = tokens.emplace_back(s.substr(pos, nextPos - pos));
+      boost::trim(newToken);
+
+      // Step past the delimiter and any whitespace that follows it
+      ++nextPos;
+      while (nextPos < s.size() && isSpace(s[nextPos]))
+      {
+         ++nextPos;
+      }
+
+      // Store new position value
+      pos = nextPos;
+   }
+
+   // Add the remainder of the string as a token
+   if (pos < s.size())
+   {
+      auto& newToken = tokens.emplace_back(s.substr(pos));
+      boost::trim(newToken);
+   }
+
+   return tokens;
+}
+
 std::string ToString(const std::vector<std::string>& v)
 {
   std::string value {};