#define LIBXML_HTML_ENABLED #include #include #pragma warning(push, 0) #include #include #include #pragma warning(pop) namespace scwx { namespace network { static const std::string logPrefix_ = "scwx::network::dir_list"; static const auto logger_ = util::Logger::Create(logPrefix_); static const cpr::SslOptions kSslOptions_ = cpr::Ssl(cpr::ssl::TLSv1_2 {}); static const cpr::HttpVersion kHttpVersion_ { cpr::HttpVersionCode::VERSION_2_0_TLS}; class DirListSAXHandler { public: DirListSAXHandler() = delete; static void StartDocument(void* userData); static void EndDocument(void* userData); static void StartElement(void* userData, const xmlChar* name, const xmlChar** attrs); static void EndElement(void* userData, const xmlChar* name); static void Characters(void* userData, const xmlChar* ch, int len); static void Warning(void* userData, const char* msg, ...); static void Error(void* userData, const char* msg, ...); static void Critical(void* userData, const char* msg, ...); }; struct DirListSAXData { enum class State { FindingLink, FoundLink, UpdateLinkTimestamp, UpdateLinkSize }; State state_ {State::FindingLink}; size_t warningCount_ {0u}; size_t errorCount_ {0u}; size_t criticalCount_ {0u}; std::vector records_; }; static htmlSAXHandler saxHandler_ // {.startElement = &DirListSAXHandler::StartElement, .endElement = &DirListSAXHandler::EndElement, .characters = &DirListSAXHandler::Characters, .warning = &DirListSAXHandler::Warning, .error = &DirListSAXHandler::Error, .fatalError = &DirListSAXHandler::Critical}; std::vector DirList(const std::string& baseUrl) { using namespace std::chrono; logger_->trace("DirList: {}", baseUrl); cpr::Response response = cpr::Get(cpr::Url {baseUrl}, kSslOptions_, kHttpVersion_); DirListSAXData saxData {}; if (response.status_code != cpr::status::HTTP_OK) { logger_->warn("Bad response from {}: {} ({})", baseUrl, response.error.message, response.status_code); } else { htmlDocPtr doc = htmlSAXParseDoc( reinterpret_cast(response.text.c_str()), nullptr, &saxHandler_, &saxData); if (doc != nullptr) { xmlFreeDoc(doc); } } return saxData.records_; } void DirListSAXHandler::StartElement(void* userData, const xmlChar* name, const xmlChar** attrs) { logger_->trace("SAX: Start Element: {}", name); DirListSAXData* data = reinterpret_cast(userData); if (strcmp(reinterpret_cast(name), "a") == 0) { // If an "a" element is found, search for an "href" attribute for (int i = 0; attrs != nullptr && attrs[i] != nullptr; ++i) { if (i > 0 && strcmp(reinterpret_cast(attrs[i - 1]), "href") == 0) { // If the "href" attribute is found, treat this as a new file std::string filename {reinterpret_cast(attrs[i])}; std::filesystem::file_type fileType; // Determine if the file is a directory if (filename.ends_with("/")) { filename.pop_back(); fileType = std::filesystem::file_type::directory; } else { fileType = std::filesystem::file_type::regular; } // If the filename is valid, add it as a record if (filename.size() > 0 && !filename.starts_with("?") && // And the filename is not a duplicate of the previous record (data->records_.size() == 0 || data->records_.back().filename_ != filename)) { data->records_.emplace_back(filename, fileType); data->state_ = DirListSAXData::State::FoundLink; break; } } } } for (int i = 0; attrs != nullptr && attrs[i] != nullptr; ++i) { logger_->trace(" Attribute: {}", attrs[i]); } } void DirListSAXHandler::EndElement(void* userData, const xmlChar* name) { logger_->trace("SAX: End Element: {}", name); DirListSAXData* data = reinterpret_cast(userData); if (data->state_ == DirListSAXData::State::FoundLink && strcmp(reinterpret_cast(name), "a") == 0) { // The "a" element is closed, so begin looking for the timestamp data->state_ = DirListSAXData::State::UpdateLinkTimestamp; } } void DirListSAXHandler::Characters(void* userData, const xmlChar* ch, int len) { std::string characters(reinterpret_cast(ch), len); logger_->trace("SAX: Characters: {}", characters); DirListSAXData* data = reinterpret_cast(userData); if (data->state_ == DirListSAXData::State::UpdateLinkTimestamp) { using namespace std::chrono; // Date time format: yyyy-mm-dd hh:mm static const std::string kDateTimeFormat {"%Y-%m-%d %H:%M"}; static constexpr size_t kDateTimeSize {16u}; // Attempt to parse the date time std::istringstream ssCharacters {characters}; sys_time mtime; ssCharacters >> parse(kDateTimeFormat, mtime); if (!ssCharacters.fail()) { // Date time parsing succeeded, look for link size auto& record = data->records_.back(); record.mtime_ = mtime; if (record.type_ == std::filesystem::file_type::directory) { // If the record is a directory, there is no size, skip to next link data->state_ = DirListSAXData::State::FindingLink; } else { // After the time is parsed, get the file size data->state_ = DirListSAXData::State::UpdateLinkSize; } } } else if (data->state_ == DirListSAXData::State::UpdateLinkSize) { // Trim the file size string std::string fileSizeString {characters}; boost::trim(fileSizeString); size_t fileSize = 0u; size_t multiplier = 1u; // Look for size suffix if (fileSizeString.ends_with("K")) { fileSizeString.pop_back(); multiplier = 1024u; } else if (fileSizeString.ends_with("M")) { fileSizeString.pop_back(); multiplier = 1024u * 1024u; } else if (fileSizeString.ends_with("G")) { fileSizeString.pop_back(); multiplier = 1024u * 1024u * 1024u; } else if (fileSizeString.ends_with("T")) { fileSizeString.pop_back(); multiplier = 1024ull * 1024ull * 1024ull * 1024ull; } try { // Parse the remaining file size string, and multiply by the suffix fileSize = static_cast(std::stod(fileSizeString) * multiplier); data->records_.back().size_ = fileSize; // Look for the next link data->state_ = DirListSAXData::State::FindingLink; } catch (const std::exception&) { // This was something other than a file size } } } void DirListSAXHandler::Warning(void* /* userData */, const char* msg, ...) { logger_->warn("SAX: {}", msg); } void DirListSAXHandler::Error(void* /* userData */, const char* msg, ...) { logger_->error("SAX: {}", msg); } void DirListSAXHandler::Critical(void* /* userData */, const char* msg, ...) { logger_->critical("SAX: {}", msg); } } // namespace network } // namespace scwx