almost finished first sitemap logic I think

djkato 2024-07-08 20:20:36 +02:00
parent c986b7feeb
commit 1340398449
12 changed files with 276 additions and 48 deletions

Cargo.lock (generated)

@@ -3938,6 +3938,7 @@ dependencies = [
  "serde_json",
  "surf",
  "tera",
+ "thiserror",
  "tinytemplate",
  "tokio",
  "tower",


@@ -1,5 +1,5 @@
 [toolchain]
-channel = "nightly-2024-04-25"
+channel = "nightly-2024-06-20"
 ## Toggle to this one for sdk releases
 # channel = "stable"
 targets = ["x86_64-unknown-linux-gnu", "wasm32-unknown-unknown"]


@@ -28,6 +28,7 @@ tower-http = { workspace = true, features = ["fs", "trace"] }
 surf.workspace = true
 cynic = { workspace = true, features = ["http-surf"] }
 cynic-codegen.workspace = true
+thiserror.workspace = true
 tera = { version = "1.19.1", default-features = false }
 fd-lock = "4.0.2"
 quick-xml = { version = "0.34.0", features = ["serialize"] }
@@ -37,6 +38,7 @@ chrono = { version = "0.4.34", features = ["serde"] }
 serde_cbor = "0.11.2"
 pico-args = "0.5.0"
 rayon = "1.10.0"
+# itertools = "0.13.0"
 
 [build-dependencies]
 cynic-codegen.workspace = true


@@ -10,6 +10,8 @@ use saleor_app_sdk::{config::Config, manifest::AppManifest, SaleorApp};
 use serde::{Deserialize, Serialize};
 use tracing::level_filters::LevelFilter;
 
+use crate::queries::event_subjects_updated::Event;
+
 // Make our own error that wraps `anyhow::Error`.
 pub struct AppError(anyhow::Error);
 
@@ -60,7 +62,7 @@ pub struct AppState {
     pub target_channel: String,
     pub sitemap_config: SitemapConfig,
     pub manifest: AppManifest,
-    pub task_queue_sender: Sender<EventType>,
+    pub task_queue_sender: Sender<Event>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]


@@ -31,7 +31,7 @@ use tracing::{debug, error, info};
 use crate::{
     app::{trace_to_std, AppState, SitemapConfig},
     queries::event_subjects_updated::EVENTS_QUERY,
-    routes::{create_routes, register::regenerate},
+    routes::create_routes,
 };
 
 #[tokio::main]


@@ -0,0 +1 @@


@@ -1,30 +1,129 @@
+use quick_xml::DeError;
 use rayon::prelude::*;
 use std::{
-    fs::{read_dir, File},
+    fs::{self, read_dir, File},
     io::BufReader,
+    path::PathBuf,
 };
+use tinytemplate::TinyTemplate;
 
-use crate::queries::event_subjects_updated::Event;
+use crate::{app::SitemapConfig, queries::event_subjects_updated::Event, sitemap::Url};
 use tokio::{sync::mpsc::Receiver, task::JoinHandle};
-use tracing::warn;
+use tracing::{debug, error, trace, warn};
 
-use super::UrlSet;
+use super::{RefType, UrlSet};
+
+// 10k links google says, but there's also a size limit and my custom params might be messing with
+// that? Rather split prematurely to be sure.
+const MAX_URL_IN_SET: usize = 6000;
 
 pub struct EventHandler {
-    receiver: Receiver<Event>,
+    receiver: Receiver<(Event, SitemapConfig)>,
 }
 
 impl EventHandler {
-    pub fn start(receiver: Receiver<Event>) -> JoinHandle<()> {
+    pub fn start(receiver: Receiver<(Event, SitemapConfig)>) -> JoinHandle<()> {
         let mut s = Self { receiver };
         tokio::spawn(s.listen())
     }
 
     async fn listen(mut self) {
-        while let Some(message) = self.receiver.recv().await {
+        while let Some((message, sitemap_config)) = self.receiver.recv().await {
             match message {
                 Event::ProductCreated(product) => {}
-                Event::ProductUpdated(product) => {}
+                Event::ProductUpdated(product) => {
+                    if let Some(product) = product.product {
+                        let mut url_sets = read_xmls(&sitemap_config.target_folder).await;
+                        let mut was_any_set_affected = false;
+
+                        //in case no sitemaps exist yet, create first urlset
+                        if url_sets.is_empty() {
+                            let url_set = UrlSet::new();
+                            url_sets.push((
+                                url_set,
+                                std::path::Path::new(&format!(
+                                    "{}/0.xml",
+                                    sitemap_config.target_folder
+                                ))
+                                .to_path_buf(),
+                            ));
+                        }
+
+                        // check if any url_sets contain affected urls
+                        for (set, path) in &mut url_sets {
+                            let mut affected_urls = set.find_urls(product.id.inner());
+                            if affected_urls.len() == 0 {
+                                trace!("Product doesn't exist in url_set {:?}", path);
+                                continue;
+                            }
+                            was_any_set_affected = true;
+
+                            // Update affected urls
+                            affected_urls.iter_mut().for_each(|url| {
+                                let mut templater = TinyTemplate::new();
+                                templater
+                                    .add_template("product", &sitemap_config.product_template)
+                                    .expect("Check your url templates!");
+                                let new_loc = templater
+                                    .render("product", &product)
+                                    .expect("Check your url templates!");
+                                debug!("updated `{}` to `{}`", &url.loc, new_loc);
+                                url.loc = new_loc;
+                            });
+                        }
+
+                        //create product url if no set contained url with it
+                        if !was_any_set_affected {
+                            debug!("Product isn't in any sitemap, creating...");
+                            if let Some((last_url_set, _)) = url_sets.last_mut() {
+                                if product.category.is_none() {
+                                    debug!("product missing category, hopefully not needed in url template?");
+                                }
+                                last_url_set.url.push(Url::new_with_ref(
+                                    product.id.inner().to_owned(),
+                                    product.slug,
+                                    RefType::Product,
+                                    product.category.clone().map(|c| c.id.inner().to_owned()),
+                                    product.category.clone().map(|c| c.slug),
+                                    Some(RefType::Category),
+                                ));
+                            }
+                        }
+
+                        let mut split_url_sets = vec![];
+                        //write first time, if some throw too long error, split and try in second
+                        //loop
+                        for url_set in url_sets {
+                            if let Err(e) = write_urlset_to_file(&url_set).await {
+                                match e {
+                                    WriteUrlSetToFileErr::UrlSetTooLong(l) => {
+                                        debug!("url set too large ({l}), splitting...");
+                                        if let Some(mut new_url_sets) =
+                                            split_urlset_to_new_file(url_set).await
+                                        {
+                                            split_url_sets.append(&mut new_url_sets);
+                                        }
+                                    }
+                                    e => error!("{:?}", e),
+                                }
+                            };
+                        }
+                        //the second attempt
+                        for url_set in split_url_sets {
+                            if let Err(e) = write_urlset_to_file(&url_set).await {
+                                match e {
+                                    WriteUrlSetToFileErr::UrlSetTooLong(l) => {
+                                        error!("url set STILL too large?? ({l}), ignoring url set {:?}...", url_set);
+                                    }
+                                    e => error!("{:?}", e),
+                                }
+                            };
+                        }
+                    }
+                    warn!("Event::ProductCreated missing product");
+                }
                 Event::ProductDeleted(product) => {}
                 Event::CategoryCreated(category) => {}
                 Event::CategoryUpdated(category) => {}
@@ -41,9 +140,9 @@ impl EventHandler {
     }
 }
 
-async fn read_xmls() {
-    let paths = read_dir(std::env::var("SITEMAP_TARGET_FOLDER").unwrap()).unwrap();
-    let mut all_urls: Vec<UrlSet> = paths
+async fn read_xmls(target_folder: &str) -> Vec<(UrlSet, PathBuf)> {
+    let paths = read_dir(target_folder).unwrap();
+    let all_urls: Vec<(UrlSet, PathBuf)> = paths
         .into_iter()
         .par_bridge()
         .filter_map(|path| {
@@ -51,10 +150,79 @@ async fn read_xmls() {
             if path.path().is_file() {
                 let file = File::open(path.path()).expect("Unable to open file");
                 let reader = BufReader::new(file);
-                return Some(quick_xml::de::from_reader(reader).unwrap());
+                return Some((quick_xml::de::from_reader(reader).unwrap(), path.path()));
             }
         }
         return None;
     })
     .collect();
+    all_urls
+}
+
+/**
+ * fails `if url_set.url.len() > MAX_URL_IN_SET`
+ */
+async fn split_urlset_to_new_file(union: (UrlSet, PathBuf)) -> Option<Vec<(UrlSet, PathBuf)>> {
+    let (url_set, path) = union;
+    if url_set.url.len() < MAX_URL_IN_SET {
+        return None;
+    }
+
+    let mut was_original_file_assigned = false;
+    let chunks = url_set.url.chunks(MAX_URL_IN_SET).collect::<Vec<_>>();
+    let mut file_number = path
+        .file_stem()
+        .unwrap()
+        .to_str()
+        .unwrap()
+        .parse::<i32>()
+        .unwrap();
+
+    return Some(
+        chunks
+            .into_iter()
+            .map(|urls| {
+                let folder = path.clone().parent().unwrap().to_str().unwrap().to_owned();
+                //keep incrementing file number till a file with that number is free to use
+                if !was_original_file_assigned {
+                    was_original_file_assigned = true
+                } else {
+                    while !std::path::Path::new(&format!("{folder}/{file_number}.xml")).exists() {
+                        file_number = file_number + 1;
+                    }
+                }
+                let mut url_set = UrlSet::new();
+                url_set.url = urls.into();
+                (
+                    url_set,
+                    std::path::Path::new(&format!("{folder}/{file_number}.xml")).to_path_buf(),
+                )
+            })
+            .collect::<Vec<_>>(),
+    );
+}
+
+async fn write_urlset_to_file(
+    url_set_n_path: &(UrlSet, PathBuf),
+) -> Result<(), WriteUrlSetToFileErr> {
+    let (url_set, path) = url_set_n_path;
+
+    if url_set.url.len() > MAX_URL_IN_SET {
+        return Err(WriteUrlSetToFileErr::UrlSetTooLong(url_set.url.len()));
+    }
+
+    fs::write(path, &quick_xml::se::to_string(&url_set)?)?;
+    Ok(())
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WriteUrlSetToFileErr {
+    #[error("writing error")]
+    IoResult(#[from] std::io::Error),
+    #[error("Url set length exeeds xml standard of 10k entries per file")]
+    UrlSetTooLong(usize),
+    #[error("{0}")]
+    DeError(#[from] DeError),
 }
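Editor's note (not part of the commit): the two write loops above first try to write each UrlSet as-is, and any set rejected with UrlSetTooLong is split into chunks of at most MAX_URL_IN_SET entries, each written to its own numbered .xml file, before a second write pass. A minimal standalone sketch of that chunking idea, using a simplified Url type and plain string file names instead of the crate's real UrlSet/PathBuf handling:

    // Standalone sketch, not the crate's code: split a flat list of urls into
    // numbered sitemap files of at most MAX_URL_IN_SET entries each.
    const MAX_URL_IN_SET: usize = 6000;

    /// Simplified stand-in for the crate's `Url` entry.
    #[derive(Debug, Clone)]
    struct Url {
        loc: String,
    }

    /// Returns (file name, chunk) pairs: ("0.xml", first 6000 urls), ("1.xml", ...), ...
    fn split_into_files(urls: &[Url]) -> Vec<(String, Vec<Url>)> {
        urls.chunks(MAX_URL_IN_SET)
            .enumerate()
            .map(|(i, chunk)| (format!("{i}.xml"), chunk.to_vec()))
            .collect()
    }

    fn main() {
        let urls: Vec<Url> = (0..13_000)
            .map(|i| Url { loc: format!("https://example.com/product-{i}") })
            .collect();
        // The real handler serializes each chunk with quick_xml and writes it into
        // sitemap_config.target_folder; here we only report the chunk sizes.
        for (file, chunk) in split_into_files(&urls) {
            println!("{file}: {} urls", chunk.len());
        }
    }

The committed split_urlset_to_new_file additionally keeps the original file's number for the first chunk and picks new numbered files for the rest, so existing sitemap files keep their names.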

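Editor's note (not part of the commit): affected URLs are re-rendered with TinyTemplate from sitemap_config.product_template, so the final loc comes from a user-supplied template string rather than code. A rough sketch of how such a template renders against a serializable product; the field names and template below are illustrative, not the app's real GraphQL types or config:

    use serde::Serialize;
    use tinytemplate::TinyTemplate;

    #[derive(Serialize)]
    struct Category {
        slug: String,
    }

    #[derive(Serialize)]
    struct Product {
        slug: String,
        category: Category,
    }

    fn main() -> Result<(), tinytemplate::error::Error> {
        let mut templater = TinyTemplate::new();
        // Hypothetical template in the style of sitemap_config.product_template:
        templater.add_template("product", "https://example.com/{category.slug}/{slug}")?;

        let product = Product {
            slug: "red-shirt".to_string(),
            category: Category { slug: "clothing".to_string() },
        };
        let loc = templater.render("product", &product)?;
        println!("{loc}"); // https://example.com/clothing/red-shirt
        Ok(())
    }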

@@ -8,13 +8,10 @@ use chrono::{DateTime, FixedOffset, SubsecRound};
 use quick_xml::DeError;
 use serde::{Deserialize, Serialize};
 
 const SITEMAP_XMLNS: &str = "http://sitemaps.org/schemas/sitemap/0.9";
 const SALEOR_REF_XMLNS: &str = "http://app-sitemap-generator.kremik.sk/xml-schemas/saleor-ref.xsd";
 
-#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 #[serde(rename = "urlset")]
 pub struct UrlSet {
     #[serde(rename = "@xmlns:saleor")]
@@ -24,28 +21,44 @@ pub struct UrlSet {
     pub url: Vec<Url>,
 }
 
-#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 pub struct Url {
     pub loc: String,
     pub lastmod: DateTime<FixedOffset>,
     #[serde(rename = "saleor:ref")]
     pub saleor_ref: SaleorRef,
 }
 
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 pub enum RefType {
     Product,
+    Category,
+    Collection,
+    Page,
 }
 
-#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 pub struct SaleorRef {
     #[serde(rename = "saleor:id")]
     pub id: String,
     #[serde(rename = "saleor:type")]
-    pub typ: String,
-    #[serde(rename = "saleor:category-id")]
+    pub typ: RefType,
+    /**
+    Related items come first in url, if present. eg:
+    site.com/{page} : typ = RefType::Page
+    site.com/{category}/{product} : typ= Product, related_typ: Category
+    */
+    #[serde(rename = "saleor:related-id")]
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub category_id: Option<String>,
-    pub product_id: Option<String>,
+    pub related_id: Option<String>,
+    /**
+    Related items come first in url, if present. eg:
+    site.com/{page} : typ = RefType::Page
+    site.com/{category}/{product} : typ= Product, related_typ: Category
+    */
+    #[serde(rename = "saleor:related-typ")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub related_typ: Option<RefType>,
 }
 
@@ -76,14 +89,25 @@ impl UrlSet {
             url: vec![],
         }
     }
+
+    pub fn find_urls(&mut self, id: &str) -> Vec<&mut Url> {
+        self.url
+            .iter_mut()
+            .filter(|url| {
+                url.saleor_ref.id == id || url.saleor_ref.related_id == Some(id.to_owned())
+            })
+            .collect()
+    }
 }
 
 impl Url {
-    pub fn new_generic_url(id: String, slug: String) -> Self {
+    pub fn new(id: String, slug: String, typ: RefType) -> Self {
         Self {
             saleor_ref: SaleorRef {
-                product_id: None,
-                category_id: Some(id),
+                id,
+                typ,
+                related_id: None,
+                related_typ: None,
             },
             lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1),
             // Have template string determine the url
@@ -91,20 +115,35 @@ impl Url {
         }
     }
 
-    pub fn new_product_url(
-        category_id: String,
-        product_id: String,
-        category_slug: String,
-        product_slug: String,
+    /**
+    For exaple: product/category, product/collection
+    */
+    pub fn new_with_ref(
+        id: String,
+        slug: String,
+        typ: RefType,
+        related_id: Option<String>,
+        related_slug: Option<String>,
+        related_typ: Option<RefType>,
     ) -> Self {
+        let loc = match related_slug {
+            Some(r_s) => {
+                format!("https://example.com/{r_s}/{slug}")
+            }
+            None => {
+                format!("https://example.com/{slug}")
+            }
+        };
         Self {
-            // Have template string determine the url
-            loc: format!("https://example.com/{category_slug}/{product_slug}"),
-            lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1),
             saleor_ref: SaleorRef {
-                product_id: Some(product_id),
-                category_id: Some(category_id),
+                id,
+                typ,
+                related_id,
+                related_typ,
             },
+            lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1),
+            // Have template string determine the url
+            loc,
         }
     }
 }
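Editor's note (not part of the commit): the reworked saleor:ref stores the object's own id and typ plus an optional related object, and the doc comment above says related items come first in the URL. A simplified standalone sketch of how the two constructors compose loc under that rule, with trimmed-down types and the same https://example.com placeholder the commit still hard-codes:

    // Standalone sketch with simplified types, not the crate's real structs.
    #[derive(Debug, Clone, PartialEq)]
    enum RefType {
        Product,
        Category,
        Collection,
        Page,
    }

    #[derive(Debug)]
    struct Url {
        loc: String,
        id: String,
        typ: RefType,
        related_id: Option<String>,
        related_typ: Option<RefType>,
    }

    impl Url {
        // Mirrors Url::new: a standalone object, e.g. https://example.com/{slug}
        fn new(id: String, slug: String, typ: RefType) -> Self {
            Url {
                loc: format!("https://example.com/{slug}"),
                id,
                typ,
                related_id: None,
                related_typ: None,
            }
        }

        // Mirrors Url::new_with_ref: the related object's slug comes first,
        // e.g. https://example.com/{category}/{product}
        fn new_with_ref(
            id: String,
            slug: String,
            typ: RefType,
            related_id: Option<String>,
            related_slug: Option<String>,
            related_typ: Option<RefType>,
        ) -> Self {
            let loc = match &related_slug {
                Some(r_s) => format!("https://example.com/{r_s}/{slug}"),
                None => format!("https://example.com/{slug}"),
            };
            Url { loc, id, typ, related_id, related_typ }
        }
    }

    fn main() {
        let page = Url::new("page-id".into(), "about-us".into(), RefType::Page);
        let product = Url::new_with_ref(
            "product-id".into(),
            "red-shirt".into(),
            RefType::Product,
            Some("category-id".into()),
            Some("clothing".into()),
            Some(RefType::Category),
        );
        println!("{}", page.loc); // https://example.com/about-us
        println!("{}", product.loc); // https://example.com/clothing/red-shirt
    }

In the committed code the ids come from the GraphQL objects and lastmod is set to the current time; both are left out here for brevity.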


@@ -0,0 +1 @@


@@ -0,0 +1 @@


@@ -1,23 +1,35 @@
 #[cfg(test)]
 mod test {
-    use crate::sitemap::{Url, UrlSet};
+    use crate::sitemap::{RefType, Url, UrlSet};
 
     fn urlset_serialisation_isnt_lossy() {
         let mut url_set = UrlSet::new();
         url_set.url.append(&mut vec![
-            Url::new_generic_url("category1coolid".to_string(), "category1".to_string()),
-            Url::new_generic_url("category2coolid".to_string(), "category2".to_string()),
-            Url::new_product_url(
+            Url::new(
                 "category1coolid".to_string(),
                 "category1".to_string(),
-                "product1coolid".to_string(),
-                "product1".to_string(),
+                RefType::Category,
             ),
-            Url::new_product_url(
+            Url::new(
+                "Collection1".to_string(),
+                "Collection1coolid".to_string(),
+                RefType::Collection,
+            ),
+            Url::new_with_ref(
+                "category1coolid".to_string(),
+                "category1".to_string(),
+                RefType::Product,
+                Some("product1coolid".to_string()),
+                Some("product1".to_string()),
+                Some(RefType::Category),
+            ),
+            Url::new_with_ref(
                 "category2coolid".to_string(),
                 "category2".to_string(),
-                "product2coolid".to_string(),
-                "product2".to_string(),
+                RefType::Product,
+                Some("product2coolid".to_string()),
+                Some("product2".to_string()),
+                Some(RefType::Category),
             ),
         ]);
         let file_str = url_set.to_file().unwrap();