From d3261bf95075d2b1aeea8e4e555236b47c36dd49 Mon Sep 17 00:00:00 2001 From: Edgar Date: Tue, 20 Apr 2021 16:47:43 +0200 Subject: [PATCH] upd 0.4 --- Cargo.toml | 6 +- README.md | 97 +++++++--------- examples/gen_sitemap.rs | 91 +++++++-------- src/lib.rs | 237 +++++++++++++++++++++++----------------- 4 files changed, 225 insertions(+), 206 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cf26270..e3cefc8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sitewriter" -version = "0.3.2" +version = "0.4.0" authors = ["Edgar "] edition = "2018" description = "A sitemap writing library." @@ -13,4 +13,6 @@ categories = ["parsing"] [dependencies] chrono = "0.4.19" -quick-xml = "0.21.0" +derive_builder = "0.10.0" +quick-xml = "0.22.0" +url = "2.2.1" diff --git a/README.md b/README.md index 614b868..6fdc50c 100644 --- a/README.md +++ b/README.md @@ -13,77 +13,56 @@ It uses the [quick-xml](https://github.com/tafia/quick-xml) so it should be fast To run the examples use `cargo run --example gen_sitemap` ```rust -use chrono::prelude::*; -use sitewriter::*; + use chrono::prelude::*; + use sitewriter::*; -fn main() { - let mut sitemap = Sitemap::new(); - sitemap.urls.push(Url::new("https://edgarluque.com/projects")); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/", +let urls = vec![ + UrlEntryBuilder::default() + .loc("https://edgarluque.com/projects".parse().unwrap()) + .build() + .unwrap(), + UrlEntry { + loc: "https://edgarluque.com/".parse().unwrap(), changefreq: Some(ChangeFreq::Daily), priority: Some(1.0), lastmod: Some(Utc::now()), - }); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog", + }, + UrlEntry { + loc: "https://edgarluque.com/blog".parse().unwrap(), changefreq: Some(ChangeFreq::Weekly), priority: Some(0.8), lastmod: Some(Utc::now()), - }); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog/sitewriter", + }, + UrlEntry { + loc: "https://edgarluque.com/blog/sitewriter".parse().unwrap(), changefreq: Some(ChangeFreq::Never), priority: Some(0.5), lastmod: Some(Utc.ymd(2020, 11, 22).and_hms(15, 10, 15)), - }); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog/some-future-post", + }, + UrlEntry { + loc: "https://edgarluque.com/blog/some-future-post" + .parse() + .unwrap(), changefreq: Some(ChangeFreq::Never), priority: Some(0.5), - lastmod: Some(Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc())), - }); + lastmod: Some( + Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), + ), + }, + // Entity escaping + UrlEntry { + loc: "https://edgarluque.com/blog/test&id=''" + .parse() + .unwrap(), + changefreq: Some(ChangeFreq::Never), + priority: Some(0.5), + lastmod: Some( + Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), + ), + }, +]; - - let result = sitemap.into_str(); - println!("{}", result); -} +let result = Sitemap::into_str(&urls).unwrap(); +println!("{}", result); ``` -Prints the following: -```xml - - - - https://edgarluque.com/projects - - - https://edgarluque.com/ - 2020-11-22T14:36:30Z - 1.0 - daily - - - https://edgarluque.com/blog - 2020-11-22T14:36:30Z - 0.8 - weekly - - - https://edgarluque.com/blog/sitewriter - 2020-11-22T15:10:15Z - 0.5 - never - - - https://edgarluque.com/blog/some-future-post - 2020-12-05T11:30:00Z - 0.5 - never - - -``` diff --git a/examples/gen_sitemap.rs b/examples/gen_sitemap.rs index 1c8836b..8261e4c 100644 --- a/examples/gen_sitemap.rs +++ b/examples/gen_sitemap.rs @@ -2,51 +2,52 @@ use chrono::prelude::*; use sitewriter::*; fn main() { - let mut sitemap = Sitemap::new(); - sitemap - .urls - .push(Url::new("https://edgarluque.com/projects")); + let urls = vec![ + UrlEntryBuilder::default() + .loc("https://edgarluque.com/projects".parse().unwrap()) + .build() + .unwrap(), + UrlEntry { + loc: "https://edgarluque.com/".parse().unwrap(), + changefreq: Some(ChangeFreq::Daily), + priority: Some(1.0), + lastmod: Some(Utc::now()), + }, + UrlEntry { + loc: "https://edgarluque.com/blog".parse().unwrap(), + changefreq: Some(ChangeFreq::Weekly), + priority: Some(0.8), + lastmod: Some(Utc::now()), + }, + UrlEntry { + loc: "https://edgarluque.com/blog/sitewriter".parse().unwrap(), + changefreq: Some(ChangeFreq::Never), + priority: Some(0.5), + lastmod: Some(Utc.ymd(2020, 11, 22).and_hms(15, 10, 15)), + }, + UrlEntry { + loc: "https://edgarluque.com/blog/some-future-post" + .parse() + .unwrap(), + changefreq: Some(ChangeFreq::Never), + priority: Some(0.5), + lastmod: Some( + Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), + ), + }, + // Entity escaping + UrlEntry { + loc: "https://edgarluque.com/blog/test&id=''" + .parse() + .unwrap(), + changefreq: Some(ChangeFreq::Never), + priority: Some(0.5), + lastmod: Some( + Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), + ), + }, + ]; - sitemap.urls.push(Url { - loc: "https://edgarluque.com/", - changefreq: Some(ChangeFreq::Daily), - priority: Some(1.0), - lastmod: Some(Utc::now()), - }); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog", - changefreq: Some(ChangeFreq::Weekly), - priority: Some(0.8), - lastmod: Some(Utc::now()), - }); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog/sitewriter", - changefreq: Some(ChangeFreq::Never), - priority: Some(0.5), - lastmod: Some(Utc.ymd(2020, 11, 22).and_hms(15, 10, 15)), - }); - - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog/some-future-post", - changefreq: Some(ChangeFreq::Never), - priority: Some(0.5), - lastmod: Some( - Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), - ), - }); - - // Entity escaping - sitemap.urls.push(Url { - loc: "https://edgarluque.com/blog/test&id=''", - changefreq: Some(ChangeFreq::Never), - priority: Some(0.5), - lastmod: Some( - Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), - ), - }); - - let result = sitemap.into_str(); + let result = Sitemap::into_str(&urls).unwrap(); println!("{}", result); } diff --git a/src/lib.rs b/src/lib.rs index 559dcbb..6f74345 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,43 +8,59 @@ //! use chrono::prelude::*; //! use sitewriter::*; //! -//! let mut sitemap = Sitemap::new(); -//! sitemap.urls.push(Url::new("https://edgarluque.com/projects")); +//! let urls = vec![ +//! UrlEntryBuilder::default() +//! .loc("https://edgarluque.com/projects".parse().unwrap()) +//! .build() +//! .unwrap(), +//! UrlEntry { +//! loc: "https://edgarluque.com/".parse().unwrap(), +//! changefreq: Some(ChangeFreq::Daily), +//! priority: Some(1.0), +//! lastmod: Some(Utc::now()), +//! }, +//! UrlEntry { +//! loc: "https://edgarluque.com/blog".parse().unwrap(), +//! changefreq: Some(ChangeFreq::Weekly), +//! priority: Some(0.8), +//! lastmod: Some(Utc::now()), +//! }, +//! UrlEntry { +//! loc: "https://edgarluque.com/blog/sitewriter".parse().unwrap(), +//! changefreq: Some(ChangeFreq::Never), +//! priority: Some(0.5), +//! lastmod: Some(Utc.ymd(2020, 11, 22).and_hms(15, 10, 15)), +//! }, +//! UrlEntry { +//! loc: "https://edgarluque.com/blog/some-future-post" +//! .parse() +//! .unwrap(), +//! changefreq: Some(ChangeFreq::Never), +//! priority: Some(0.5), +//! lastmod: Some( +//! Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), +//! ), +//! }, +//! // Entity escaping +//! UrlEntry { +//! loc: "https://edgarluque.com/blog/test&id=''" +//! .parse() +//! .unwrap(), +//! changefreq: Some(ChangeFreq::Never), +//! priority: Some(0.5), +//! lastmod: Some( +//! Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc()), +//! ), +//! }, +//! ]; //! -//! sitemap.urls.push(Url { -//! loc: "https://edgarluque.com/", -//! changefreq: Some(ChangeFreq::Daily), -//! priority: Some(1.0), -//! lastmod: Some(Utc::now()), -//! }); -//! -//! sitemap.urls.push(Url { -//! loc: "https://edgarluque.com/blog", -//! changefreq: Some(ChangeFreq::Weekly), -//! priority: Some(0.8), -//! lastmod: Some(Utc::now()), -//! }); -//! -//! sitemap.urls.push(Url { -//! loc: "https://edgarluque.com/blog/sitewriter", -//! changefreq: Some(ChangeFreq::Never), -//! priority: Some(0.5), -//! lastmod: Some(Utc.ymd(2020, 11, 22).and_hms(15, 10, 15)), -//! }); -//! -//! sitemap.urls.push(Url { -//! loc: "https://edgarluque.com/blog/some-future-post", -//! changefreq: Some(ChangeFreq::Never), -//! priority: Some(0.5), -//! lastmod: Some(Utc.from_utc_datetime(&Local.ymd(2020, 12, 5).and_hms(12, 30, 0).naive_utc())), -//! }); -//! -//! -//! let result = sitemap.into_str(); -//! println!("{}", result); +//! let result = Sitemap::into_str(&urls).unwrap(); +//! println!("{}", result); //! ``` use chrono::{DateTime, SecondsFormat, Utc}; +use derive_builder::Builder; +use url::Url; use quick_xml::{ events::{BytesDecl, BytesEnd, BytesStart, BytesText, Event}, @@ -55,7 +71,7 @@ use std::fmt::Display; use std::io::Cursor; /// How frequently the page is likely to change. This value provides general information to search engines and may not correlate exactly to how often they crawl the page. -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, Copy, Clone)] pub enum ChangeFreq { /// Changes each time it's accessed. Always, @@ -89,42 +105,50 @@ impl Display for ChangeFreq { } /// A sitemap url entry. -#[derive(Debug)] -pub struct Url<'a> { +#[derive(Debug, Clone, Builder)] +#[builder(setter(strip_option))] +pub struct UrlEntry { /// URL of the page. /// /// This URL must begin with the protocol (such as http) and end with a trailing slash, if your web server requires it. This value must be less than 2,048 characters. - pub loc: &'a str, + pub loc: Url, /// The date of last modification of the file. + #[builder(default)] pub lastmod: Option>, /// How frequently the page is likely to change. + #[builder(default)] pub changefreq: Option, /// The priority of this URL relative to other URLs on your site. Valid values range from 0.0 to 1.0. /// /// This value does not affect how your pages are compared to pages on other sites—it only lets the search engines know which pages you deem most important for the crawlers. + #[builder(default)] pub priority: Option, } -impl<'a> Url<'a> { - /// Creates a url (sitemap entry) with only the required elements. - pub fn new(loc: &'a str) -> Self { +impl UrlEntry { + pub fn new( + loc: Url, + lastmod: Option>, + changefreq: Option, + priority: Option, + ) -> Self { Self { loc, - lastmod: None, - changefreq: None, - priority: None, + lastmod, + changefreq, + priority, } } } -/// Struct to hold the sitemap information. +/// Struct that implements the sitemap generation function. #[derive(Debug)] -pub struct Sitemap<'a> { - /// The list of url entries. - pub urls: Vec>, -} +pub struct Sitemap; -fn write_tag(writer: &mut Writer, tag: &str, text: &str) { +fn write_tag(writer: &mut Writer, tag: &str, text: &str) +where + T: std::io::Write, +{ writer .write_event(Event::Start(BytesStart::borrowed_name(tag.as_bytes()))) .expect(&format!("error opening {}", tag)); @@ -136,16 +160,12 @@ fn write_tag(writer: &mut Writer, tag: &str, text: &str) { .expect(&format!("error opening {}", tag)); } -impl<'a> Sitemap<'a> { - /// Create a new sitemap. - pub fn new() -> Self { - Self { urls: Vec::new() } - } - - /// Generates the sitemap using the provided writer. +impl Sitemap { + /// Generates the sitemap and saves it using the provided writer. /// - /// It's recommended to use [`Sitemap::into_bytes()`] or [`Sitemap::into_str()`] - pub fn generate(&self, inner_writer: T) -> T + /// It's recommended to use [`Sitemap::into_bytes`] or [`Sitemap::into_str`] if you need a + /// String or a Vec. + pub fn generate(inner_writer: T, urls: &[UrlEntry]) -> T where T: std::io::Write, { @@ -161,23 +181,24 @@ impl<'a> Sitemap<'a> { .write_event(Event::Start(urlset)) .expect("error opening urlset"); - for url in self.urls.iter() { + for entry in urls { writer .write_event(Event::Start(BytesStart::borrowed_name(b"url"))) .expect("error opening url"); - write_tag(&mut writer, "loc", &url.loc); - if let Some(lastmod) = &url.lastmod { + write_tag(&mut writer, "loc", entry.loc.as_str()); + + if let Some(lastmod) = &entry.lastmod { write_tag( &mut writer, "lastmod", &lastmod.to_rfc3339_opts(SecondsFormat::Secs, true), ); } - if let Some(priority) = &url.priority { + if let Some(priority) = &entry.priority { write_tag(&mut writer, "priority", &format!("{:.1}", priority)) } - if let Some(changefreq) = &url.changefreq { + if let Some(changefreq) = &entry.changefreq { write_tag(&mut writer, "changefreq", &changefreq.to_string()); } @@ -194,17 +215,17 @@ impl<'a> Sitemap<'a> { } /// Generates the sitemap. - pub fn into_bytes(self) -> Vec { + pub fn into_bytes(urls: &[UrlEntry]) -> Vec { let inner = Cursor::new(Vec::new()); - let result = self.generate(inner); + let result = Sitemap::generate(inner, urls); result.into_inner() } /// Generates the sitemap returning a string. - pub fn into_str(self) -> String { - let bytes = self.into_bytes(); - let res = std::str::from_utf8(&bytes).expect("error parsing sitemap bytes to str"); - res.to_owned() + pub fn into_str(urls: &[UrlEntry]) -> Result { + let bytes = Sitemap::into_bytes(urls); + let res = std::str::from_utf8(&bytes)?; + Ok(res.to_owned()) } } @@ -216,38 +237,54 @@ mod tests { fn it_works() { use chrono::Utc; - let mut sitemap = Sitemap::new(); - sitemap.urls.push(Url::new("https://domain.com/")); + let urls = vec![ + // Builder pattern + UrlEntryBuilder::default() + .loc("https://domain.com".parse().unwrap()) + .priority(0.2) + .build() + .unwrap(), + // Using new + UrlEntry::new( + "https://domain.com/some_url".parse().unwrap(), + None, + None, + None, + ), + // Initializing the struct. + UrlEntry { + loc: "https://domain.com/another".parse().unwrap(), + priority: None, + changefreq: Some(ChangeFreq::Always), + lastmod: None, + }, + UrlEntry { + loc: "https://domain.com/url".parse().unwrap(), + changefreq: Some(ChangeFreq::Daily), + priority: Some(0.8), + lastmod: Some(Utc::now()), + }, + UrlEntry { + loc: "https://domain.com/aa".parse().unwrap(), + changefreq: Some(ChangeFreq::Monthly), + priority: None, + lastmod: None, + }, + UrlEntry { + loc: "https://domain.com/bb".parse().unwrap(), + changefreq: None, + priority: None, + lastmod: None, + }, + UrlEntry { + loc: "https://domain.com/bb&id=''".parse().unwrap(), + changefreq: None, + priority: Some(0.4), + lastmod: None, + }, + ]; - sitemap.urls.push(Url { - loc: "https://domain.com/url", - changefreq: Some(ChangeFreq::Daily), - priority: Some(0.8), - lastmod: Some(Utc::now()), - }); - - sitemap.urls.push(Url { - loc: "https://domain.com/aa", - changefreq: Some(ChangeFreq::Monthly), - priority: None, - lastmod: None, - }); - - sitemap.urls.push(Url { - loc: "https://domain.com/bb", - changefreq: None, - priority: None, - lastmod: None, - }); - - sitemap.urls.push(Url { - loc: "https://domain.com/bb&id=''", - changefreq: None, - priority: None, - lastmod: None, - }); - - sitemap.into_str(); + Sitemap::into_str(&urls).unwrap(); } #[test]