almost finished first sitemap logic I think

parent c986b7feeb
commit 1340398449

12 changed files with 276 additions and 48 deletions

Cargo.lock (generated): 1 change
@@ -3938,6 +3938,7 @@ dependencies = [
  "serde_json",
  "surf",
  "tera",
  "thiserror",
  "tinytemplate",
  "tokio",
  "tower",
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "nightly-2024-04-25"
+channel = "nightly-2024-06-20"
 ## Toggle to this one for sdk releases
 # channel = "stable"
 targets = ["x86_64-unknown-linux-gnu", "wasm32-unknown-unknown"]
@@ -28,6 +28,7 @@ tower-http = { workspace = true, features = ["fs", "trace"] }
 surf.workspace = true
 cynic = { workspace = true, features = ["http-surf"] }
 cynic-codegen.workspace = true
 thiserror.workspace = true
 tera = { version = "1.19.1", default-features = false }
 fd-lock = "4.0.2"
 quick-xml = { version = "0.34.0", features = ["serialize"] }

@@ -37,6 +38,7 @@ chrono = { version = "0.4.34", features = ["serde"] }
 serde_cbor = "0.11.2"
 pico-args = "0.5.0"
 rayon = "1.10.0"
 # itertools = "0.13.0"

 [build-dependencies]
 cynic-codegen.workspace = true
@@ -10,6 +10,8 @@ use saleor_app_sdk::{config::Config, manifest::AppManifest, SaleorApp};
 use serde::{Deserialize, Serialize};
 use tracing::level_filters::LevelFilter;

+use crate::queries::event_subjects_updated::Event;
+
 // Make our own error that wraps `anyhow::Error`.
 pub struct AppError(anyhow::Error);

@@ -60,7 +62,7 @@ pub struct AppState {
     pub target_channel: String,
     pub sitemap_config: SitemapConfig,
     pub manifest: AppManifest,
-    pub task_queue_sender: Sender<EventType>,
+    pub task_queue_sender: Sender<Event>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -31,7 +31,7 @@ use tracing::{debug, error, info};
 use crate::{
     app::{trace_to_std, AppState, SitemapConfig},
     queries::event_subjects_updated::EVENTS_QUERY,
-    routes::{create_routes, register::regenerate},
+    routes::create_routes,
 };

 #[tokio::main]
@@ -0,0 +1 @@

@@ -0,0 +1 @@
@@ -1,30 +1,129 @@
 use quick_xml::DeError;
 use rayon::prelude::*;
 use std::{
-    fs::{read_dir, File},
+    fs::{self, read_dir, File},
     io::BufReader,
     path::PathBuf,
 };
 use tinytemplate::TinyTemplate;

-use crate::queries::event_subjects_updated::Event;
+use crate::{app::SitemapConfig, queries::event_subjects_updated::Event, sitemap::Url};
 use tokio::{sync::mpsc::Receiver, task::JoinHandle};
-use tracing::warn;
+use tracing::{debug, error, trace, warn};

-use super::UrlSet;
+use super::{RefType, UrlSet};

 // Google says 10k links per sitemap, but there's also a size limit, and the custom
 // params might be pushing that. Rather split prematurely to be sure.
 const MAX_URL_IN_SET: usize = 6000;

 pub struct EventHandler {
-    receiver: Receiver<Event>,
+    receiver: Receiver<(Event, SitemapConfig)>,
 }

 impl EventHandler {
-    pub fn start(receiver: Receiver<Event>) -> JoinHandle<()> {
+    pub fn start(receiver: Receiver<(Event, SitemapConfig)>) -> JoinHandle<()> {
         let mut s = Self { receiver };
         tokio::spawn(s.listen())
     }

     async fn listen(mut self) {
-        while let Some(message) = self.receiver.recv().await {
+        while let Some((message, sitemap_config)) = self.receiver.recv().await {
             match message {
                 Event::ProductCreated(product) => {}
-                Event::ProductUpdated(product) => {}
+                Event::ProductUpdated(product) => {
+                    if let Some(product) = product.product {
+                        let mut url_sets = read_xmls(&sitemap_config.target_folder).await;
+                        let mut was_any_set_affected = false;
+
+                        // In case no sitemaps exist yet, create the first url set
+                        if url_sets.is_empty() {
+                            let url_set = UrlSet::new();
+                            url_sets.push((
+                                url_set,
+                                std::path::Path::new(&format!(
+                                    "{}/0.xml",
+                                    sitemap_config.target_folder
+                                ))
+                                .to_path_buf(),
+                            ));
+                        }
+
+                        // Check if any url_sets contain affected urls
+                        for (set, path) in &mut url_sets {
+                            let mut affected_urls = set.find_urls(product.id.inner());
+
+                            if affected_urls.is_empty() {
+                                trace!("Product doesn't exist in url_set {:?}", path);
+                                continue;
+                            }
+                            was_any_set_affected = true;
+
+                            // Update affected urls
+                            affected_urls.iter_mut().for_each(|url| {
+                                let mut templater = TinyTemplate::new();
+                                templater
+                                    .add_template("product", &sitemap_config.product_template)
+                                    .expect("Check your url templates!");
+                                let new_loc = templater
+                                    .render("product", &product)
+                                    .expect("Check your url templates!");
+                                debug!("updated `{}` to `{}`", &url.loc, new_loc);
+                                url.loc = new_loc;
+                            });
+                        }
+
+                        // Create a product url if no set contained one for it
+                        if !was_any_set_affected {
+                            debug!("Product isn't in any sitemap, creating...");
+                            if let Some((last_url_set, _)) = url_sets.last_mut() {
+                                if product.category.is_none() {
+                                    debug!("product missing category, hopefully not needed in url template?");
+                                }
+                                last_url_set.url.push(Url::new_with_ref(
+                                    product.id.inner().to_owned(),
+                                    product.slug,
+                                    RefType::Product,
+                                    product.category.clone().map(|c| c.id.inner().to_owned()),
+                                    product.category.clone().map(|c| c.slug),
+                                    Some(RefType::Category),
+                                ));
+                            }
+                        }
+
+                        let mut split_url_sets = vec![];
+                        // Write a first time; if some sets return a too-long error,
+                        // split them and retry in the second loop
+                        for url_set in url_sets {
+                            if let Err(e) = write_urlset_to_file(&url_set).await {
+                                match e {
+                                    WriteUrlSetToFileErr::UrlSetTooLong(l) => {
+                                        debug!("url set too large ({l}), splitting...");
+                                        if let Some(mut new_url_sets) =
+                                            split_urlset_to_new_file(url_set).await
+                                        {
+                                            split_url_sets.append(&mut new_url_sets);
+                                        }
+                                    }
+                                    e => error!("{:?}", e),
+                                }
+                            };
+                        }
+
+                        // The second attempt
+                        for url_set in split_url_sets {
+                            if let Err(e) = write_urlset_to_file(&url_set).await {
+                                match e {
+                                    WriteUrlSetToFileErr::UrlSetTooLong(l) => {
+                                        error!(
+                                            "url set STILL too large?? ({l}), ignoring url set {:?}...",
+                                            url_set
+                                        );
+                                    }
+                                    e => error!("{:?}", e),
+                                }
+                            };
+                        }
+                    } else {
+                        warn!("Event::ProductUpdated missing product");
+                    }
+                }
                 Event::ProductDeleted(product) => {}
                 Event::CategoryCreated(category) => {}
                 Event::CategoryUpdated(category) => {}
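Note: the loc update above re-renders each affected url through TinyTemplate. Below is a minimal standalone sketch of that call, assuming a hypothetical template string and made-up field names (the real template comes from sitemap_config.product_template and the real context is the webhook's product payload):

    use serde::Serialize;
    use tinytemplate::TinyTemplate;

    #[derive(Serialize)]
    struct Category {
        slug: String,
    }

    #[derive(Serialize)]
    struct Product {
        slug: String,
        category: Category,
    }

    fn main() {
        // Hypothetical template; the real string is user-configured.
        let template = "https://example.com/{ category.slug }/{ slug }";

        let product = Product {
            slug: "blue-mug".to_string(),
            category: Category { slug: "kitchen".to_string() },
        };

        let mut templater = TinyTemplate::new();
        // Register the template, then render it against the serialized product.
        templater.add_template("product", template).expect("Check your url templates!");
        let new_loc = templater.render("product", &product).expect("Check your url templates!");
        assert_eq!(new_loc, "https://example.com/kitchen/blue-mug");
    }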
@@ -41,9 +140,9 @@ impl EventHandler {
     }
 }

-async fn read_xmls() {
-    let paths = read_dir(std::env::var("SITEMAP_TARGET_FOLDER").unwrap()).unwrap();
-    let mut all_urls: Vec<UrlSet> = paths
+async fn read_xmls(target_folder: &str) -> Vec<(UrlSet, PathBuf)> {
+    let paths = read_dir(target_folder).unwrap();
+    let all_urls: Vec<(UrlSet, PathBuf)> = paths
         .into_iter()
         .par_bridge()
         .filter_map(|path| {

@@ -51,10 +150,79 @@ async fn read_xmls() {
             if path.path().is_file() {
                 let file = File::open(path.path()).expect("Unable to open file");
                 let reader = BufReader::new(file);
-                return Some(quick_xml::de::from_reader(reader).unwrap());
+                return Some((quick_xml::de::from_reader(reader).unwrap(), path.path()));
             }
         }
         return None;
     })
     .collect();
+    all_urls
 }
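Note: read_dir yields a serial iterator, and rayon's par_bridge() is what lets the sitemap files be parsed on the thread pool. A generic sketch of that pattern under simplified assumptions (it just reads file lengths instead of deserializing sitemaps):

    use rayon::prelude::*; // brings par_bridge() into scope

    fn main() {
        // par_bridge() turns any ordinary Iterator into a ParallelIterator,
        // so each directory entry is processed on rayon's worker threads.
        let lens: Vec<usize> = std::fs::read_dir(".")
            .unwrap()
            .par_bridge()
            .filter_map(|entry| {
                let path = entry.ok()?.path();
                if !path.is_file() {
                    return None;
                }
                Some(std::fs::read(&path).ok()?.len())
            })
            .collect();
        println!("{} files read in parallel", lens.len());
    }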
+/**
+ * Returns `None` if `url_set.url.len() < MAX_URL_IN_SET` (nothing to split).
+ */
+async fn split_urlset_to_new_file(union: (UrlSet, PathBuf)) -> Option<Vec<(UrlSet, PathBuf)>> {
+    let (url_set, path) = union;
+
+    if url_set.url.len() < MAX_URL_IN_SET {
+        return None;
+    }
+
+    let mut was_original_file_assigned = false;
+    let chunks = url_set.url.chunks(MAX_URL_IN_SET).collect::<Vec<_>>();
+
+    let mut file_number = path
+        .file_stem()
+        .unwrap()
+        .to_str()
+        .unwrap()
+        .parse::<i32>()
+        .unwrap();
+
+    return Some(
+        chunks
+            .into_iter()
+            .map(|urls| {
+                let folder = path.clone().parent().unwrap().to_str().unwrap().to_owned();
+
+                // Keep incrementing the file number till a file with that number is free to use
+                if !was_original_file_assigned {
+                    was_original_file_assigned = true
+                } else {
+                    while std::path::Path::new(&format!("{folder}/{file_number}.xml")).exists() {
+                        file_number += 1;
+                    }
+                }
+
+                let mut url_set = UrlSet::new();
+                url_set.url = urls.into();
+                (
+                    url_set,
+                    std::path::Path::new(&format!("{folder}/{file_number}.xml")).to_path_buf(),
+                )
+            })
+            .collect::<Vec<_>>(),
+    );
+}
+
+async fn write_urlset_to_file(
+    url_set_n_path: &(UrlSet, PathBuf),
+) -> Result<(), WriteUrlSetToFileErr> {
+    let (url_set, path) = url_set_n_path;
+    if url_set.url.len() > MAX_URL_IN_SET {
+        return Err(WriteUrlSetToFileErr::UrlSetTooLong(url_set.url.len()));
+    }
+    fs::write(path, &quick_xml::se::to_string(&url_set)?)?;
+    Ok(())
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WriteUrlSetToFileErr {
+    #[error("writing error")]
+    IoResult(#[from] std::io::Error),
+    #[error("Url set length exceeds xml standard of 10k entries per file")]
+    UrlSetTooLong(usize),
+    #[error("{0}")]
+    DeError(#[from] DeError),
+}
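Note: to make the numbering policy above concrete, here is a rough standalone sketch of it (the helper name and folder are invented for illustration): the first chunk keeps the original file's number, and every further chunk takes the next free {n}.xml in the folder.

    use std::path::{Path, PathBuf};

    const MAX_URL_IN_SET: usize = 6000; // same cap as above

    /// Plan output paths for splitting an oversized set stored at {folder}/{n}.xml.
    fn plan_split_paths(folder: &str, original_number: i32, total_urls: usize) -> Vec<PathBuf> {
        let chunk_count = total_urls.div_ceil(MAX_URL_IN_SET); // ceiling division
        let mut paths = vec![PathBuf::from(format!("{folder}/{original_number}.xml"))];
        let mut n = original_number + 1;
        for _ in 1..chunk_count {
            // Keep incrementing until a file with that number is free to use.
            while Path::new(&format!("{folder}/{n}.xml")).exists() {
                n += 1;
            }
            paths.push(PathBuf::from(format!("{folder}/{n}.xml")));
            n += 1;
        }
        paths
    }

    fn main() {
        // 13k urls in 0.xml split into three chunks: 0.xml plus the next two free numbers.
        let paths = plan_split_paths("/tmp/sitemaps", 0, 13_000);
        assert_eq!(paths.len(), 3);
    }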
@@ -8,13 +8,10 @@ use chrono::{DateTime, FixedOffset, SubsecRound};
 use quick_xml::DeError;
 use serde::{Deserialize, Serialize};

 const SITEMAP_XMLNS: &str = "http://sitemaps.org/schemas/sitemap/0.9";
 const SALEOR_REF_XMLNS: &str = "http://app-sitemap-generator.kremik.sk/xml-schemas/saleor-ref.xsd";

-#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 #[serde(rename = "urlset")]
 pub struct UrlSet {
     #[serde(rename = "@xmlns:saleor")]
@@ -24,28 +21,44 @@ pub struct UrlSet {
     pub url: Vec<Url>,
 }

-#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 pub struct Url {
     pub loc: String,
     pub lastmod: DateTime<FixedOffset>,
     #[serde(rename = "saleor:ref")]
     pub saleor_ref: SaleorRef,
 }

+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
+pub enum RefType {
+    Product,
+    Category,
+    Collection,
+    Page,
+}
+
-#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Clone)]
 pub struct SaleorRef {
     #[serde(rename = "saleor:id")]
     pub id: String,
     #[serde(rename = "saleor:type")]
-    pub typ: String,
-    #[serde(rename = "saleor:category-id")]
-    pub category_id: Option<String>,
-    pub product_id: Option<String>,
+    pub typ: RefType,
+    /**
+    Related items come first in the url, if present. E.g.:
+    site.com/{page} : typ = RefType::Page
+    site.com/{category}/{product} : typ = Product, related_typ = Category
+    */
+    #[serde(rename = "saleor:related-id")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub related_id: Option<String>,
+    /**
+    Related items come first in the url, if present. E.g.:
+    site.com/{page} : typ = RefType::Page
+    site.com/{category}/{product} : typ = Product, related_typ = Category
+    */
+    #[serde(rename = "saleor:related-typ")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub related_typ: Option<RefType>,
 }

 impl UrlSet {
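Note: with Serialize and Deserialize both derived, these types can round-trip through quick-xml, which is what the lossless-serialisation test below relies on. A simplified round-trip sketch using a stand-in struct (not the real Url, and without the saleor namespace attributes):

    use quick_xml::{de, se};
    use serde::{Deserialize, Serialize};

    #[derive(Serialize, Deserialize, PartialEq, Debug)]
    #[serde(rename = "url")]
    struct MiniUrl {
        loc: String,
        lastmod: String,
    }

    fn main() {
        let url = MiniUrl {
            loc: "https://example.com/kitchen/blue-mug".to_string(),
            lastmod: "2024-06-20T12:00:00+00:00".to_string(),
        };
        // Serializes to roughly:
        // <url><loc>https://example.com/kitchen/blue-mug</loc><lastmod>...</lastmod></url>
        let xml = se::to_string(&url).unwrap();
        let back: MiniUrl = de::from_str(&xml).unwrap();
        assert_eq!(back, url); // nothing lost in the round trip
    }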
@@ -76,14 +89,25 @@ impl UrlSet {
             url: vec![],
         }
     }

+    pub fn find_urls(&mut self, id: &str) -> Vec<&mut Url> {
+        self.url
+            .iter_mut()
+            .filter(|url| {
+                url.saleor_ref.id == id || url.saleor_ref.related_id == Some(id.to_owned())
+            })
+            .collect()
+    }
 }

 impl Url {
-    pub fn new_generic_url(id: String, slug: String) -> Self {
+    pub fn new(id: String, slug: String, typ: RefType) -> Self {
         Self {
             saleor_ref: SaleorRef {
-                product_id: None,
-                category_id: Some(id),
+                id,
+                typ,
+                related_id: None,
+                related_typ: None,
             },
             lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1),
             // Have template string determine the url
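Note: find_urls treats an entry as affected when the given id matches either its own saleor:id or its saleor:related-id, so a product update also rewrites urls where the product appears only as the related item. A self-contained sketch of that predicate (a mini struct stands in for SaleorRef):

    #[derive(Debug)]
    struct MiniRef {
        id: String,
        related_id: Option<String>,
    }

    /// Same matching rule as UrlSet::find_urls: own id or related id.
    fn matches(r: &MiniRef, id: &str) -> bool {
        r.id == id || r.related_id.as_deref() == Some(id)
    }

    fn main() {
        let own = MiniRef { id: "prod1".into(), related_id: None };
        let related = MiniRef { id: "cat1".into(), related_id: Some("prod1".into()) };
        assert!(matches(&own, "prod1"));
        assert!(matches(&related, "prod1")); // matched via related_id
        assert!(!matches(&related, "prod2"));
    }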
@@ -91,20 +115,35 @@ impl Url {
         }
     }

-    pub fn new_product_url(
-        category_id: String,
-        product_id: String,
-        category_slug: String,
-        product_slug: String,
+    /**
+    For example: product/category, product/collection
+    */
+    pub fn new_with_ref(
+        id: String,
+        slug: String,
+        typ: RefType,
+        related_id: Option<String>,
+        related_slug: Option<String>,
+        related_typ: Option<RefType>,
     ) -> Self {
+        let loc = match related_slug {
+            Some(r_s) => {
+                format!("https://example.com/{r_s}/{slug}")
+            }
+            None => {
+                format!("https://example.com/{slug}")
+            }
+        };
         Self {
-            // Have template string determine the url
-            loc: format!("https://example.com/{category_slug}/{product_slug}"),
-            lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1),
             saleor_ref: SaleorRef {
-                product_id: Some(product_id),
-                category_id: Some(category_id),
+                id,
+                typ,
+                related_id,
+                related_typ,
             },
+            lastmod: chrono::offset::Utc::now().fixed_offset().round_subsecs(1),
+            // Have template string determine the url
+            loc,
         }
     }
 }
@@ -0,0 +1 @@

@@ -0,0 +1 @@
@@ -1,23 +1,35 @@
 #[cfg(test)]
 mod test {
-    use crate::sitemap::{Url, UrlSet};
+    use crate::sitemap::{RefType, Url, UrlSet};

     fn urlset_serialisation_isnt_lossy() {
         let mut url_set = UrlSet::new();
         url_set.url.append(&mut vec![
-            Url::new_generic_url("category1coolid".to_string(), "category1".to_string()),
-            Url::new_generic_url("category2coolid".to_string(), "category2".to_string()),
-            Url::new_product_url(
-                "category1coolid".to_string(),
-                "category1".to_string(),
-                "product1coolid".to_string(),
-                "product1".to_string(),
-            ),
-            Url::new_product_url(
-                "category2coolid".to_string(),
-                "category2".to_string(),
-                "product2coolid".to_string(),
-                "product2".to_string(),
-            ),
+            Url::new(
+                "category1coolid".to_string(),
+                "category1".to_string(),
+                RefType::Category,
+            ),
+            Url::new(
+                "Collection1".to_string(),
+                "Collection1coolid".to_string(),
+                RefType::Collection,
+            ),
+            Url::new_with_ref(
+                "category1coolid".to_string(),
+                "category1".to_string(),
+                RefType::Product,
+                Some("product1coolid".to_string()),
+                Some("product1".to_string()),
+                Some(RefType::Category),
+            ),
+            Url::new_with_ref(
+                "category2coolid".to_string(),
+                "category2".to_string(),
+                RefType::Product,
+                Some("product2coolid".to_string()),
+                Some("product2".to_string()),
+                Some(RefType::Category),
+            ),
         ]);
         let file_str = url_set.to_file().unwrap();