Create custom string tokenizer for use in placefile parsing

- Avoids the use of regular expressions, and is expected to be more efficient with large placefiles
2025-10-30 15:40:04 +00:00 · 2023-07-16 23:59:28 -05:00 · 2023-07-16 23:59:28 -05:00 · 6767c0c50a
commit 6767c0c50a
parent 88475f5b0e
4 changed files with 148 additions and 0 deletions
--- a/wxdata/include/scwx/util/strings.hpp
+++ b/wxdata/include/scwx/util/strings.hpp
@ -8,6 +8,25 @@ namespace scwx
 namespace util
 {

+/**
+ * @brief Parse a list of tokens from a string
+ *
+ * This function will take an input string, and apply the delimiters vector in
+ * order to tokenize the string. Each set of delimiters in the delimiters vector
+ * will be used once. A set of delimiters will be used to match any character,
+ * rather than a sequence of characters. Tokens are automatically trimmed of any
+ * whitespace.
+ *
+ * If a token begins with a quotation mark ("), the delimiter search resumes
+ * after the closing quotation mark, so delimiter characters inside the quoted
+ * text do not split the token. The quotation marks remain part of the token.
+ *
+ * If a set of delimiters cannot be found, tokenization stops. Any text
+ * remaining once tokenization stops, or once every set of delimiters has been
+ * used, is added as a final token. At most delimiters.size() + 1 tokens are
+ * therefore returned.
+ *
+ * @param [in] s Input string to tokenize
+ * @param [in] delimiters A vector of delimiters to use for each token. Each
+ * element is a set of characters, any one of which ends the token.
+ * @param [in] pos Search begin position. Default is 0.
+ *
+ * @return Trimmed tokens, in the order they appear in the input string
+ */
+std::vector<std::string> ParseTokens(const std::string&       s,
+                                     std::vector<std::string> delimiters,
+                                     std::size_t              pos = 0);
+
 std::string ToString(const std::vector<std::string>& v);

 } // namespace util
--- a/wxdata/source/scwx/util/strings.cpp
+++ b/wxdata/source/scwx/util/strings.cpp
@ -1,10 +1,76 @@
 #include <scwx/util/strings.hpp>

+#include <boost/algorithm/string/trim.hpp>
+
 namespace scwx
 {
 namespace util
 {

+// Split s into trimmed tokens, consuming one set of delimiter characters per
+// token. A token that begins with a quotation mark is not split on delimiter
+// characters found inside the quotes. Whatever text remains once the
+// delimiters are exhausted (or one cannot be found) becomes the final token.
+std::vector<std::string> ParseTokens(const std::string&       s,
+                                     std::vector<std::string> delimiters,
+                                     std::size_t              pos)
+{
+   // std::isspace has undefined behavior when given a negative value other
+   // than EOF. Where char is signed, bytes >= 0x80 (such as UTF-8 encoded
+   // text) are negative, so always classify the character as unsigned char.
+   const auto isSpace = [](char c)
+   { return std::isspace(static_cast<unsigned char>(c)) != 0; };
+
+   std::vector<std::string> tokens {};
+   std::size_t              findPos {};
+
+   // Iterate through each delimiter
+   for (std::size_t i = 0; i < delimiters.size() && pos != std::string::npos;
+        ++i)
+   {
+      // Skip leading spaces
+      while (pos < s.size() && isSpace(s[pos]))
+      {
+         ++pos;
+      }
+
+      if (pos < s.size() && s[pos] == '"')
+      {
+         // Do not search for a delimiter within a quoted string
+         findPos = s.find('"', pos + 1);
+
+         // Increment search start to one after the closing quotation mark. If
+         // the quote is never closed, findPos stays npos, the delimiter search
+         // below fails, and the rest of the string becomes the final token.
+         if (findPos != std::string::npos)
+         {
+            ++findPos;
+         }
+      }
+      else
+      {
+         // Search starting at the current position
+         findPos = pos;
+      }
+
+      // Search for any one character from the current set of delimiters
+      std::size_t nextPos = s.find_first_of(delimiters[i], findPos);
+
+      // If the delimiter was not found, stop processing tokens
+      if (nextPos == std::string::npos)
+      {
+         break;
+      }
+
+      // Add the current substring as a token (the delimiter is excluded, any
+      // surrounding quotation marks are kept)
+      auto& newToken = tokens.emplace_back(s.substr(pos, nextPos - pos));
+      boost::trim(newToken);
+
+      // Step over the delimiter, then advance nextPos until the next non-space
+      // character
+      while (++nextPos < s.size() && isSpace(s[nextPos])) {}
+
+      // Store new position value
+      pos = nextPos;
+   }
+
+   // Add the remainder of the string as a token
+   if (pos < s.size())
+   {
+      auto& newToken = tokens.emplace_back(s.substr(pos));
+      boost::trim(newToken);
+   }
+
+   return tokens;
+}
+
 std::string ToString(const std::vector<std::string>& v)
 {
   std::string value {};