

// Copyright (C) 2025 Simon Quigley <>
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
#include "sources_parser.h"
#include "utilities.h"

#include "/usr/include/archive.h"
#include <archive_entry.h>
#include <curl/curl.h>

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <fstream> // Added to resolve ofstream errors
#include <iostream>
#include <map>
#include <memory>
#include <optional>
#include <ranges>
#include <regex>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <QtCore/QJsonArray>
#include <QtCore/QJsonDocument>
#include <QtCore/QJsonObject>
namespace SourcesParser {
/**
 * libcurl write callback (CURLOPT_WRITEFUNCTION) that appends every received
 * chunk to the std::vector<char> supplied through CURLOPT_WRITEDATA.
 *
 * @param contents Start of the received chunk (not NUL-terminated).
 * @param size     Size of one element of the chunk.
 * @param nmemb    Number of elements in the chunk.
 * @param userp    Must point to the std::vector<char> that collects the data.
 * @return size * nmemb when the chunk was stored. If storing the chunk throws
 *         (e.g. allocation failure), 0 is returned; libcurl treats a return
 *         value that differs from the chunk size as a write error and aborts
 *         the transfer.
 */
size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
    const size_t totalSize = size * nmemb;
    auto* buffer = static_cast<std::vector<char>*>(userp);
    const char* first = static_cast<const char*>(contents);
    try {
        buffer->insert(buffer->end(), first, first + totalSize);
    } catch (...) {
        // A C++ exception must never unwind through libcurl's C stack frames;
        // report the failure through the return value instead.
        return 0;
    }
    return totalSize;
}
/**
 * Parses a Debian relationship field (e.g. the value of Build-Depends) into
 * groups of alternatives.
 *
 * The field is split on ',' into top-level relations; each relation is split
 * on '|' into alternatives. For every alternative only the package name is
 * kept: everything from the first space or '(' onwards (version constraints,
 * architecture lists, build profiles) is discarded before the name is
 * validated.
 *
 * @param raw Field value, possibly folded onto a single line.
 * @return One inner vector per comma-separated relation that contained at
 *         least one non-empty alternative, in input order.
 */
std::vector<std::vector<PackageInfo::ParsedRelation>> parse_relations(const std::string& raw) {
    // std::regex construction is expensive; compile each pattern only once
    // instead of once per dependency token.
    static const std::regex comma_sep_RE(R"(\s*,\s*)");
    static const std::regex pipe_sep_RE(R"(\s*\|\s*)");
    static const std::regex arch_RE(R"(^([a-zA-Z0-9+\-\.]+)(?:\s*\(\s*([a-zA-Z]+)\s*([<>=]+)\s*([0-9a-zA-Z:\-+~.]+)\s*\))?$)");
    static const char* const whitespace = " \t\n\r\f\v";

    std::vector<std::vector<PackageInfo::ParsedRelation>> result;
    const std::sregex_token_iterator tokens_end;

    for (std::sregex_token_iterator comma_it(raw.begin(), raw.end(), comma_sep_RE, -1);
         comma_it != tokens_end; ++comma_it) {
        const std::string top_dep = comma_it->str();
        std::vector<PackageInfo::ParsedRelation> alternatives;

        for (std::sregex_token_iterator pipe_it(top_dep.begin(), top_dep.end(), pipe_sep_RE, -1);
             pipe_it != tokens_end; ++pipe_it) {
            std::string dep = pipe_it->str();

            // Trim leading whitespace BEFORE cutting at the first space;
            // otherwise a token such as " cmake" would be reduced to "".
            dep.erase(0, dep.find_first_not_of(whitespace));

            // Drop version constraints / qualifiers that follow the name.
            const std::size_t cut = dep.find_first_of(" (");
            if (cut != std::string::npos) {
                dep.erase(cut);
            }
            dep.erase(dep.find_last_not_of(whitespace) + 1);

            // Empty tokens (empty field, doubled separators) carry no relation.
            if (dep.empty()) {
                continue;
            }

            PackageInfo::ParsedRelation pr;
            std::smatch match;
            if (std::regex_match(dep, match, arch_RE)) {
                pr.name = match[1].str();
                // NOTE(review): dep was cut at the first ' ' or '(' above, so
                // the optional parenthesised group can no longer match and the
                // two branches below look unreachable. They are kept so that
                // relaxing the truncation later does not silently lose them.
                if (match[2].matched && match[3].matched && match[4].matched) {
                    pr.archqual = match[2].str() + match[3].str() + match[4].str();
                }
                if (match[3].matched && match[4].matched) {
                    pr.version = std::make_pair(match[3].str(), match[4].str());
                }
            } else {
                // Name contains characters outside the plain-name pattern
                // (e.g. a ":any" qualifier); keep it, minus known suffixes.
                dep = remove_suffix(dep, ":any");
                dep = remove_suffix(dep, ":native");
                pr.name = dep;
                std::cerr << "Warning: Cannot parse dependency relation \"" << dep << "\", returning it raw.\n";
            }
            alternatives.push_back(std::move(pr));
        }

        if (!alternatives.empty()) {
            result.push_back(std::move(alternatives));
        }
    }

    return result;
}
// Function to download, decompress, and parse the Sources.gz data
std::optional<std::vector<PackageInfo>> fetch_and_parse_sources(const std::string& url) {
CURL* curl = curl_easy_init();
if (!curl) {
std::cerr << "Failed to initialize CURL.\n";
return std::nullopt;
std::vector<char> downloadedData;
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &downloadedData);
// Follow redirects if any
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
// Set a user agent
curl_easy_setopt(curl, CURLOPT_USERAGENT, "SourcesParser/1.0");
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
std::cerr << "CURL download error (Sources.gz): " << curl_easy_strerror(res) << "\n";
return std::nullopt;
// Initialize libarchive
struct archive* a = archive_read_new();
if (archive_read_open_memory(a,, downloadedData.size()) != ARCHIVE_OK) {
std::cerr << "Failed to open Sources.gz archive: " << archive_error_string(a) << "\n";
return std::nullopt;
struct archive_entry* entry;
std::string decompressedData;
// Read all entries (though there should typically be only one)
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
const void* buff;
size_t size;
la_int64_t offset;
while (true) {
int r = archive_read_data_block(a, &buff, &size, &offset);
if (r == ARCHIVE_EOF)
if (r != ARCHIVE_OK) {
std::cerr << "Error during decompression (Sources.gz): " << archive_error_string(a) << "\n";
return std::nullopt;
decompressedData.append(static_cast<const char*>(buff), size);
// Parse the decompressed data
std::vector<PackageInfo> packages;
std::istringstream stream(decompressedData);
std::string line;
PackageInfo currentPackage;
bool in_entry = false;
while (std::getline(stream, line)) {
if (line.empty()) {
if (in_entry && !currentPackage.Package.empty()) {
// Finalize BuildDependsParsed
currentPackage.BuildDependsParsed = parse_relations(currentPackage.BuildDepends);
currentPackage = PackageInfo();
in_entry = false;
in_entry = true;
if (line.find("Build-Depends:") == 0) {
currentPackage.BuildDepends = line.substr(strlen("Build-Depends: "));
// Continue reading lines that start with a space or tab
while (std::getline(stream, line)) {
if (line.empty() || (!std::isspace(static_cast<unsigned char>(line[0]))))
currentPackage.BuildDepends += " " + line.substr(1);
// If the last read line is not a continuation, process it in the next iteration
if (!line.empty() && !std::isspace(static_cast<unsigned char>(line[0]))) {
stream.seekg(-static_cast<int>(line.length()) - 1, std::ios_base::cur);
if (line.find("Binary:") == 0) {
std::string binary_str;
binary_str = line.substr(strlen("Binary: "));
// Continue reading lines that start with a space or tab
while (std::getline(stream, line)) {
if (line.empty() || (!std::isspace(static_cast<unsigned char>(line[0]))))
binary_str += " " + line.substr(1);
// If the last read line is not a continuation, process it in the next iteration
if (!line.empty() && !std::isspace(static_cast<unsigned char>(line[0]))) {
stream.seekg(-static_cast<int>(line.length()) - 1, std::ios_base::cur);
currentPackage.Binary = split_string(binary_str, ", ");
// Extract Package
if (line.find("Package:") == 0) {
currentPackage.Package = line.substr(strlen("Package: "));
// Extract Provides (if any)
if (line.find("Provides:") == 0) {
std::string provides_line = line.substr(strlen("Provides: "));
// Split by commas
std::regex comma_sep_RE(R"(\s*,\s*)");
std::sregex_token_iterator provides_it(provides_line.begin(), provides_line.end(), comma_sep_RE, -1);
std::sregex_token_iterator provides_end;
for (; provides_it != provides_end; ++provides_it) {
std::string provide = provides_it->str();
// Extract the package name before any space or '('
size_t pos_space = provide.find(' ');
size_t pos_paren = provide.find('(');
size_t pos = std::string::npos;
if (pos_space != std::string::npos && pos_paren != std::string::npos) {
pos = std::min(pos_space, pos_paren);
else if (pos_space != std::string::npos) {
pos = pos_space;
else if (pos_paren != std::string::npos) {
pos = pos_paren;
if (pos != std::string::npos) {
provide = provide.substr(0, pos);
// Trim whitespace
provide.erase(provide.find_last_not_of(" \t\n\r\f\v") + 1);
provide.erase(0, provide.find_first_not_of(" \t\n\r\f\v"));
if (!provide.empty()) {
// Add the last package if the file doesn't end with a blank line
if (in_entry && !currentPackage.Package.empty()) {
// Finalize BuildDependsParsed
currentPackage.BuildDependsParsed = parse_relations(currentPackage.BuildDepends);
return packages;
// Function to download, decompress, and parse the Packages.gz data
std::optional<std::vector<PackageInfo>> fetch_and_parse_packages(const std::string& url) {
CURL* curl = curl_easy_init();
if (!curl) {
std::cerr << "Failed to initialize CURL.\n";
return std::nullopt;
std::vector<char> downloadedData;
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &downloadedData);
// Follow redirects if any
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
// Set a user agent
curl_easy_setopt(curl, CURLOPT_USERAGENT, "SourcesParser/1.0");
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
std::cerr << "CURL download error (Packages.gz): " << curl_easy_strerror(res) << "\n";
return std::nullopt;
// Initialize libarchive
struct archive* a = archive_read_new();
if (archive_read_open_memory(a,, downloadedData.size()) != ARCHIVE_OK) {
std::cerr << "Failed to open Packages.gz archive: " << archive_error_string(a) << "\n";
return std::nullopt;
struct archive_entry* entry;
std::string decompressedData;
// Read all entries (though there should typically be only one)
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
const void* buff;
size_t size;
la_int64_t offset;
while (true) {
int r = archive_read_data_block(a, &buff, &size, &offset);
if (r == ARCHIVE_EOF)
if (r != ARCHIVE_OK) {
std::cerr << "Error during decompression (Packages.gz): " << archive_error_string(a) << "\n";
return std::nullopt;
decompressedData.append(static_cast<const char*>(buff), size);
// Parse the decompressed data
std::vector<PackageInfo> packages;
std::istringstream stream(decompressedData);
std::string line;
PackageInfo currentPackage;
bool in_entry = false;
while (std::getline(stream, line)) {
if (line.empty()) {
if (in_entry && !currentPackage.Package.empty()) {
currentPackage = PackageInfo();
in_entry = false;
in_entry = true;
// Extract Package
if (line.find("Package:") == 0) {
currentPackage.Package = line.substr(strlen("Package: "));
// Extract Source
if (line.find("Source:") == 0) {
currentPackage.Source = line.substr(strlen("Source: "));
// Extract Provides
if (line.find("Provides:") == 0) {
std::string provides_line = line.substr(strlen("Provides: "));
// Split by commas
std::regex comma_sep_RE(R"(\s*,\s*)");
std::sregex_token_iterator provides_it(provides_line.begin(), provides_line.end(), comma_sep_RE, -1);
std::sregex_token_iterator provides_end;
for (; provides_it != provides_end; ++provides_it) {
std::string provide = provides_it->str();
// Extract the package name before any space or '('
size_t pos_space = provide.find(' ');
size_t pos_paren = provide.find('(');
size_t pos = std::string::npos;
if (pos_space != std::string::npos && pos_paren != std::string::npos) {
pos = std::min(pos_space, pos_paren);
else if (pos_space != std::string::npos) {
pos = pos_space;
else if (pos_paren != std::string::npos) {
pos = pos_paren;
if (pos != std::string::npos) {
provide = provide.substr(0, pos);
// Trim whitespace
provide.erase(provide.find_last_not_of(" \t\n\r\f\v") + 1);
provide.erase(0, provide.find_first_not_of(" \t\n\r\f\v"));
if (!provide.empty()) {
// Any other fields are ignored for now
// Add the last package if the file doesn't end with a blank line
if (in_entry && !currentPackage.Package.empty()) {
return packages;
/**
 * Builds a source-level build-dependency graph.
 *
 * Every parsed build dependency of a source package is resolved to the source
 * package that builds it, either directly (the dependency names a binary
 * package listed in `binaries`) or through Provides (the dependency names a
 * virtual package). Edges are stored as (dependency source, dependent source),
 * i.e. they point from what must be built first to what needs it.
 *
 * Afterwards every edge u -> w for which some v with u -> v and v -> w exists
 * is dropped (a single-step transitive reduction).
 * NOTE(review): the edges to drop are collected against the unreduced graph,
 * so in a cyclic graph an edge may be dropped although the path that made it
 * redundant is dropped as well; confirm this is acceptable to callers.
 *
 * @param sources  Source packages; Package, Binary and BuildDependsParsed are read.
 * @param binaries Binary packages; Package, Source and Provides are read.
 * @return The reduced set of (dependency, dependent) source-package edges.
 */
std::set<std::pair<std::string, std::string>> build_dependency_graph(
    const std::vector<PackageInfo>& sources,
    const std::vector<PackageInfo>& binaries) {
    // Map of virtual package to real binary package(s)
    std::map<std::string, std::vector<std::string>> virtual_to_real;
    // Set of all real binary package names
    std::set<std::string> real_binary_packages;
    // Map of binary package to its source package
    std::map<std::string, std::string> binary_to_source;

    // Populate binary_to_source mapping and virtual_to_real
    for (const auto& source_pkg : sources) {
        for (const auto& binary_pkg : source_pkg.Binary) {
            binary_to_source[binary_pkg] = source_pkg.Package;
        }
    }
    for (const auto& binary_pkg : binaries) {
        real_binary_packages.insert(binary_pkg.Package);
        if (binary_pkg.Source.has_value()) {
            // An explicit Source field overrides the Binary-list mapping.
            binary_to_source[binary_pkg.Package] = binary_pkg.Source.value();
        }
        // Process Provides
        for (const auto& provide : binary_pkg.Provides) {
            virtual_to_real[provide].push_back(binary_pkg.Package);
        }
    }

    // Dependency graph as a set of edges (dependency -> package)
    std::set<std::pair<std::string, std::string>> graph;

    for (const auto& pkg : sources) {
        if (!pkg.BuildDependsParsed.has_value())
            continue; // Skip if no build dependencies

        // Adds the edge for `binary` if it maps to a source package other
        // than pkg itself; returns false when no source package is known.
        const auto link_binary = [&](const std::string& binary) {
            const auto source_it = binary_to_source.find(binary);
            if (source_it == binary_to_source.end()) {
                return false;
            }
            // Avoid self-dependency
            if (source_it->second != pkg.Package) {
                graph.emplace(source_it->second, pkg.Package); // Reversed edge
            }
            return true;
        };

        for (const auto& or_deps : pkg.BuildDependsParsed.value()) {
            // For each set of alternative dependencies (logical OR)
            for (const auto& dep : or_deps) {
                std::string dep_name = dep.name;
                // If dep.archqual exists, append it with ':'
                if (dep.archqual.has_value())
                    dep_name += ":" + dep.archqual.value();

                // If dep_name is a virtual package, map it to real binary package(s)
                const auto providers = virtual_to_real.find(dep_name);
                if (providers != virtual_to_real.end()) {
                    for (const auto& real_pkg : providers->second) {
                        if (!link_binary(real_pkg)) {
                            std::cerr << "Warning: Binary package \"" << real_pkg << "\" provided by \""
                                      << dep_name << "\" does not map to any source package.\n";
                        }
                    }
                } else if (real_binary_packages.find(dep_name) != real_binary_packages.end()) {
                    // Direct binary dependency
                    if (!link_binary(dep_name)) {
                        std::cerr << "Warning: Binary dependency \"" << dep_name << "\" does not map to any source package.\n";
                    }
                }
                // Names that are neither provided nor known binaries are ignored.
            }
        }
    }

    // Build adjacency list from the graph
    std::map<std::string, std::set<std::string>> adj;
    for (const auto& edge : graph) {
        adj[edge.first].insert(edge.second);
    }

    // Transitive reduction: Collect edges to remove first, so the graph is
    // not modified while it is being inspected.
    std::vector<std::pair<std::string, std::string>> edges_to_remove;
    for (const auto& [u, neighbors] : adj) {
        for (const auto& v : neighbors) {
            const auto via = adj.find(v);
            if (via == adj.end()) {
                continue;
            }
            for (const auto& w : via->second) {
                if (neighbors.find(w) != neighbors.end()) {
                    edges_to_remove.emplace_back(u, w);
                }
            }
        }
    }

    // Now remove the collected edges
    for (const auto& edge : edges_to_remove) {
        graph.erase(edge);
    }

    return graph;
}
/**
 * Serializes a dependency graph to a compact JSON object.
 *
 * The result maps every dependency (edge.first) to the array of packages that
 * depend on it (edge.second). Edges with an empty endpoint are skipped.
 *
 * @param graph Set of (dependency, dependent) edges.
 * @return Compact JSON text; "{}" (plus a warning on stderr) for an empty graph.
 */
QString serialize_dependency_graph_to_json(const std::set<std::pair<std::string, std::string>>& graph) {
    // Check if the graph is empty
    if (graph.empty()) {
        std::cerr << "Warning: Dependency graph is empty." << '\n';
        return QStringLiteral("{}"); // Return empty JSON object
    }

    // Build adjacency list where key is dependency and value is list of packages that depend on it
    std::map<std::string, QJsonArray> adjacency;
    for (const auto& [dependency, dependent] : graph) {
        if (!dependency.empty() && !dependent.empty()) {
            adjacency[dependency].append(QString::fromStdString(dependent));
        }
    }

    // Convert to QJsonObject
    QJsonObject jsonObj;
    for (const auto& [dep, dependents] : adjacency) {
        jsonObj[QString::fromStdString(dep)] = dependents;
    }

    // Convert to JSON string. toJson() yields UTF-8 bytes, so decode them
    // explicitly instead of relying on an implicit QByteArray conversion.
    const QJsonDocument doc(jsonObj);
    return QString::fromUtf8(doc.toJson(QJsonDocument::Compact));
}
} // namespace SourcesParser