Directory listing

This commit is contained in:
Dan Paulat 2022-11-02 21:59:16 -05:00
parent b9fc22d63e
commit d5d9285736
6 changed files with 337 additions and 0 deletions

View file

@ -14,15 +14,19 @@ include(${PROJECT_SOURCE_DIR}/external/cmake-conan/conan.cmake)
conan_cmake_configure(REQUIRES aws-sdk-cpp/1.9.234
boost/1.78.0
cpr/1.9.0
freetype/2.12.1
geographiclib/1.52
glew/2.2.0
glm/cci.20220420
gtest/cci.20210126
libcurl/7.85.0
libxml2/2.9.14
openssl/1.1.1q
spdlog/1.10.0
sqlite3/3.39.4
vulkan-loader/1.3.221
zlib/1.2.13
GENERATORS cmake
cmake_find_package
cmake_paths

View file

@ -0,0 +1,28 @@
#include <scwx/network/dir_list.hpp>
#include <gtest/gtest.h>
namespace scwx
{
namespace network
{
// URLs of the directory listings requested by the tests below. Stored by value
// rather than as references bound to temporaries.
static const std::string kDefaultUrl {"https://warnings.allisonhouse.com"};
static const std::string kAlternateUrl {"http://warnings.cod.edu"};
// Requests the listing at the default URL and expects at least one record.
TEST(DirList, GetDefaultUrl)
{
   const auto records = DirList(kDefaultUrl);

   // Unsigned literal: records.size() is unsigned, avoid a sign mismatch
   EXPECT_GT(records.size(), 0u);
}
// Requests the listing at the alternate URL and expects at least one record.
TEST(DirList, GetAlternateUrl)
{
   const auto records = DirList(kAlternateUrl);

   // Unsigned literal: records.size() is unsigned, avoid a sign mismatch
   EXPECT_GT(records.size(), 0u);
}
} // namespace network
} // namespace scwx

View file

@ -15,6 +15,7 @@ set(SRC_AWIPS_TESTS source/scwx/awips/coded_location.test.cpp
source/scwx/awips/ugc.test.cpp)
set(SRC_COMMON_TESTS source/scwx/common/color_table.test.cpp
source/scwx/common/products.test.cpp)
set(SRC_NETWORK_TESTS source/scwx/network/dir_list.test.cpp)
set(SRC_PROVIDER_TESTS source/scwx/provider/aws_level2_data_provider.test.cpp
source/scwx/provider/aws_level3_data_provider.test.cpp)
set(SRC_QT_CONFIG_TESTS source/scwx/qt/config/county_database.test.cpp
@ -33,6 +34,7 @@ set(CMAKE_FILES test.cmake)
add_executable(wxtest ${SRC_MAIN}
${SRC_AWIPS_TESTS}
${SRC_COMMON_TESTS}
${SRC_NETWORK_TESTS}
${SRC_PROVIDER_TESTS}
${SRC_QT_CONFIG_TESTS}
${SRC_QT_MANAGER_TESTS}
@ -43,6 +45,7 @@ add_executable(wxtest ${SRC_MAIN}
source_group("Source Files\\main" FILES ${SRC_MAIN})
source_group("Source Files\\awips" FILES ${SRC_AWIPS_TESTS})
source_group("Source Files\\common" FILES ${SRC_COMMON_TESTS})
source_group("Source Files\\network" FILES ${SRC_NETWORK_TESTS})
source_group("Source Files\\provider" FILES ${SRC_PROVIDER_TESTS})
source_group("Source Files\\qt\\config" FILES ${SRC_QT_CONFIG_TESTS})
source_group("Source Files\\qt\\manager" FILES ${SRC_QT_MANAGER_TESTS})

View file

@ -0,0 +1,30 @@
#pragma once

#include <chrono>
#include <cstddef>
#include <filesystem>
#include <string>
#include <vector>

namespace scwx
{
namespace network
{

/**
 * @brief Directory Listing Record
 *
 * One entry of a directory listing, as returned by DirList().
 */
struct DirListRecord
{
   std::string filename_ = {}; ///< Entry name (no trailing slash)
   std::filesystem::file_type type_ =
      std::filesystem::file_type::unknown; ///< Directory or regular file
   std::chrono::system_clock::time_point mtime_ =
      {};             ///< Modified time (server time)
   size_t size_ = 0u; ///< Approximate file size in bytes
};

/**
 * @brief Retrieve Directory Listing
 *
 * Retrieves a directory listing. Supports default Apache-style directory
 * listings only.
 *
 * @param baseUrl URL of the directory to list
 *
 * @return Records found in the listing; empty if the listing could not be
 * retrieved
 */
std::vector<DirListRecord> DirList(const std::string& baseUrl);

} // namespace network
} // namespace scwx

View file

@ -0,0 +1,262 @@
#define LIBXML_HTML_ENABLED
#include <scwx/network/dir_list.hpp>
#include <scwx/util/logger.hpp>
#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#pragma warning(push, 0)
#include <boost/algorithm/string/trim.hpp>
#include <cpr/cpr.h>
#include <libxml/HTMLparser.h>
#pragma warning(pop)
namespace scwx
{
namespace network
{
// Name used to create this module's logger
static const std::string logPrefix_ = "scwx::network::dir_list";
// Logger shared by every function in this file
static const auto logger_ = util::Logger::Create(logPrefix_);
// SSL options passed to each request made by DirList (TLS 1.2 option)
static const cpr::SslOptions kSslOptions_ = cpr::Ssl(cpr::ssl::TLSv1_2 {});
// HTTP version option passed to each request made by DirList
static const cpr::HttpVersion kHttpVersion_ {
cpr::HttpVersionCode::VERSION_2_0_TLS};
/**
 * Collection of static SAX callbacks used to parse a directory listing.
 *
 * The class cannot be instantiated (the default constructor is deleted); it
 * only groups the callbacks. Each callback receives the parser's user data
 * pointer as its first argument.
 */
class DirListSAXHandler
{
public:
DirListSAXHandler() = delete;
// NOTE(review): StartDocument/EndDocument are declared here, but no
// definition or registration is visible in this file - confirm they are
// intentionally unused.
static void StartDocument(void* userData);
static void EndDocument(void* userData);
// Called when an element is opened; attrs lists the element's attributes
static void
StartElement(void* userData, const xmlChar* name, const xmlChar** attrs);
// Called when an element is closed
static void EndElement(void* userData, const xmlChar* name);
// Called with a run of len characters of text content starting at ch
static void Characters(void* userData, const xmlChar* ch, int len);
// Diagnostic callbacks; msg is followed by variadic arguments
static void Warning(void* userData, const char* msg, ...);
static void Error(void* userData, const char* msg, ...);
static void Critical(void* userData, const char* msg, ...);
};
/**
 * Parse context passed to the SAX callbacks as user data. Holds the current
 * position in the link/timestamp/size sequence and the records found so far.
 */
struct DirListSAXData
{
   /// Which piece of a listing entry the parser expects next
   enum class State
   {
      FindingLink,         ///< Waiting for the next link element
      FoundLink,           ///< Inside a link that produced a record
      UpdateLinkTimestamp, ///< Link closed, expecting the modified time
      UpdateLinkSize       ///< Modified time read, expecting the file size
   };

   State  state_         = State::FindingLink;
   size_t warningCount_  = 0u;
   size_t errorCount_    = 0u;
   size_t criticalCount_ = 0u;

   /// Records collected so far, in document order
   std::vector<DirListRecord> records_ {};
};
// SAX callback table handed to the HTML parser by DirList. Only the callbacks
// named below are set; the remaining members are left value-initialized.
// Not const because it is passed to the parser by non-const pointer.
static htmlSAXHandler saxHandler_ //
{.startElement = &DirListSAXHandler::StartElement,
.endElement = &DirListSAXHandler::EndElement,
.characters = &DirListSAXHandler::Characters,
.warning = &DirListSAXHandler::Warning,
.error = &DirListSAXHandler::Error,
.fatalError = &DirListSAXHandler::Critical};
std::vector<DirListRecord> DirList(const std::string& baseUrl)
{
using namespace std::chrono;
logger_->trace("DirList: {}", baseUrl);
cpr::Response response =
cpr::Get(cpr::Url {baseUrl}, kSslOptions_, kHttpVersion_);
DirListSAXData saxData {};
if (response.status_code != cpr::status::HTTP_OK)
{
logger_->warn("Bad response from {}: {} ({})",
baseUrl,
response.error.message,
response.status_code);
}
else
{
htmlDocPtr doc = htmlSAXParseDoc(
reinterpret_cast<const xmlChar*>(response.text.c_str()),
nullptr,
&saxHandler_,
&saxData);
if (doc != nullptr)
{
xmlFreeDoc(doc);
}
}
return saxData.records_;
}
void DirListSAXHandler::StartElement(void* userData,
const xmlChar* name,
const xmlChar** attrs)
{
logger_->trace("SAX: Start Element: {}", name);
DirListSAXData* data = reinterpret_cast<DirListSAXData*>(userData);
if (strcmp(reinterpret_cast<const char*>(name), "a") == 0)
{
// If an "a" element is found, search for an "href" attribute
for (int i = 0; attrs != nullptr && attrs[i] != nullptr; ++i)
{
if (i > 0 &&
strcmp(reinterpret_cast<const char*>(attrs[i - 1]), "href") == 0)
{
// If the "href" attribute is found, treat this as a new file
std::string filename {reinterpret_cast<const char*>(attrs[i])};
std::filesystem::file_type fileType;
// Determine if the file is a directory
if (filename.ends_with("/"))
{
filename.pop_back();
fileType = std::filesystem::file_type::directory;
}
else
{
fileType = std::filesystem::file_type::regular;
}
// If the filename is valid, add it as a record
if (filename.size() > 0 && !filename.starts_with("?") &&
// And the filename is not a duplicate of the previous record
(data->records_.size() == 0 ||
data->records_.back().filename_ != filename))
{
data->records_.emplace_back(filename, fileType);
data->state_ = DirListSAXData::State::FoundLink;
break;
}
}
}
}
for (int i = 0; attrs != nullptr && attrs[i] != nullptr; ++i)
{
logger_->trace(" Attribute: {}", attrs[i]);
}
}
void DirListSAXHandler::EndElement(void* userData, const xmlChar* name)
{
logger_->trace("SAX: End Element: {}", name);
DirListSAXData* data = reinterpret_cast<DirListSAXData*>(userData);
if (data->state_ == DirListSAXData::State::FoundLink &&
strcmp(reinterpret_cast<const char*>(name), "a") == 0)
{
// The "a" element is closed, so begin looking for the timestamp
data->state_ = DirListSAXData::State::UpdateLinkTimestamp;
}
}
void DirListSAXHandler::Characters(void* userData, const xmlChar* ch, int len)
{
std::string characters(reinterpret_cast<const char*>(ch), len);
logger_->trace("SAX: Characters: {}", characters);
DirListSAXData* data = reinterpret_cast<DirListSAXData*>(userData);
if (data->state_ == DirListSAXData::State::UpdateLinkTimestamp)
{
using namespace std::chrono;
// Date time format: yyyy-mm-dd hh:mm
static const std::string kDateTimeFormat {"%Y-%m-%d %H:%M"};
static constexpr size_t kDateTimeSize {16u};
// Attempt to parse the date time
std::istringstream ssCharacters {characters};
sys_time<minutes> mtime;
ssCharacters >> parse(kDateTimeFormat, mtime);
if (!ssCharacters.fail())
{
// Date time parsing succeeded, look for link size
auto& record = data->records_.back();
record.mtime_ = mtime;
if (record.type_ == std::filesystem::file_type::directory)
{
// If the record is a directory, there is no size, skip to next link
data->state_ = DirListSAXData::State::FindingLink;
}
else
{
// After the time is parsed, get the file size
data->state_ = DirListSAXData::State::UpdateLinkSize;
}
}
}
else if (data->state_ == DirListSAXData::State::UpdateLinkSize)
{
// Trim the file size string
std::string fileSizeString {characters};
boost::trim(fileSizeString);
size_t fileSize = 0u;
size_t multiplier = 1u;
// Look for size suffix
if (fileSizeString.ends_with("K"))
{
fileSizeString.pop_back();
multiplier = 1024u;
}
else if (fileSizeString.ends_with("M"))
{
fileSizeString.pop_back();
multiplier = 1024u * 1024u;
}
else if (fileSizeString.ends_with("G"))
{
fileSizeString.pop_back();
multiplier = 1024u * 1024u * 1024u;
}
else if (fileSizeString.ends_with("T"))
{
fileSizeString.pop_back();
multiplier = 1024ull * 1024ull * 1024ull * 1024ull;
}
try
{
// Parse the remaining file size string, and multiply by the suffix
fileSize = static_cast<size_t>(std::stod(fileSizeString) * multiplier);
data->records_.back().size_ = fileSize;
// Look for the next link
data->state_ = DirListSAXData::State::FindingLink;
}
catch (const std::exception&)
{
// This was something other than a file size
}
}
}
void DirListSAXHandler::Warning(void* /* userData */, const char* msg, ...)
{
logger_->warn("SAX: {}", msg);
}
void DirListSAXHandler::Error(void* /* userData */, const char* msg, ...)
{
logger_->error("SAX: {}", msg);
}
void DirListSAXHandler::Critical(void* /* userData */, const char* msg, ...)
{
logger_->critical("SAX: {}", msg);
}
} // namespace network
} // namespace scwx

View file

@ -4,6 +4,8 @@ project(scwx-data)
find_package(AWSSDK)
find_package(Boost)
find_package(cpr)
find_package(LibXml2)
find_package(spdlog)
set(HDR_AWIPS include/scwx/awips/coded_location.hpp
@ -40,6 +42,8 @@ set(SRC_COMMON source/scwx/common/characters.cpp
source/scwx/common/products.cpp
source/scwx/common/sites.cpp
source/scwx/common/vcp.cpp)
set(HDR_NETWORK include/scwx/network/dir_list.hpp)
set(SRC_NETWORK source/scwx/network/dir_list.cpp)
set(HDR_PROVIDER include/scwx/provider/aws_level2_data_provider.hpp
include/scwx/provider/aws_level3_data_provider.hpp
include/scwx/provider/aws_nexrad_data_provider.hpp
@ -186,6 +190,8 @@ add_library(wxdata OBJECT ${HDR_AWIPS}
${SRC_AWIPS}
${HDR_COMMON}
${SRC_COMMON}
${HDR_NETWORK}
${SRC_NETWORK}
${HDR_PROVIDER}
${SRC_PROVIDER}
${HDR_UTIL}
@ -202,6 +208,8 @@ source_group("Header Files\\awips" FILES ${HDR_AWIPS})
source_group("Source Files\\awips" FILES ${SRC_AWIPS})
source_group("Header Files\\common" FILES ${HDR_COMMON})
source_group("Source Files\\common" FILES ${SRC_COMMON})
source_group("Header Files\\network" FILES ${HDR_NETWORK})
source_group("Source Files\\network" FILES ${SRC_NETWORK})
source_group("Header Files\\provider" FILES ${HDR_PROVIDER})
source_group("Source Files\\provider" FILES ${SRC_PROVIDER})
source_group("Header Files\\util" FILES ${HDR_UTIL})
@ -225,6 +233,8 @@ target_compile_options(wxdata PRIVATE
)
target_link_libraries(wxdata PUBLIC AWS::s3
cpr::cpr
LibXml2::LibXml2
spdlog::spdlog)
target_link_libraries(wxdata INTERFACE Boost::iostreams
BZip2::BZip2