From 5c45046794e3c93f875d69193bb12d6608d45a8c Mon Sep 17 00:00:00 2001 From: Jokler Date: Sat, 7 Apr 2018 18:19:24 +0200 Subject: Add usefulness rating function to url titles --- src/lib.rs | 14 ++-- src/main.rs | 20 +++--- src/plugin.rs | 2 +- src/plugins/currency.rs | 2 +- src/plugins/emoji.rs | 2 +- src/plugins/factoids/database.rs | 10 +-- src/plugins/factoids/mod.rs | 24 ++++--- src/plugins/factoids/utils.rs | 7 +- src/plugins/help.rs | 2 +- src/plugins/keepnick.rs | 2 +- src/plugins/mod.rs | 10 +-- src/plugins/sed.rs | 10 +-- src/plugins/tell/database.rs | 8 +-- src/plugins/tell/mod.rs | 15 ++-- src/plugins/url.rs | 148 ++++++++++++++++++++++++++------------- src/utils.rs | 106 +++++++++++++++++++--------- 16 files changed, 238 insertions(+), 144 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 37f1225..67a99d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,6 +46,7 @@ extern crate lazy_static; #[macro_use] extern crate log; +extern crate antidote; extern crate chrono; extern crate circular_queue; extern crate humantime; @@ -53,22 +54,21 @@ extern crate irc; extern crate regex; extern crate reqwest; extern crate time; -extern crate antidote; +pub mod error; pub mod plugin; pub mod plugins; pub mod utils; -pub mod error; use std::collections::HashMap; use std::fmt; -use std::thread; use std::sync::Arc; +use std::thread; -pub use irc::client::prelude::*; -pub use irc::error::IrcError; use error::*; use failure::ResultExt; +pub use irc::client::prelude::*; +pub use irc::error::IrcError; use plugin::*; @@ -80,11 +80,11 @@ pub struct Bot<'a> { } impl<'a> Bot<'a> { - /// Creates a `Bot` without any plugins. + /// Creates a `Bot` without any plugins. /// By itself the bot only responds to a few simple CTCP commands /// defined per config file. /// Any other functionality has to be provided by plugins - /// which need to implement [`Plugin`](plugin/trait.Plugin.html). + /// which need to implement [`Plugin`](plugin/trait.Plugin.html). /// To send commands to a plugin /// the message has to start with the plugin's name prefixed by `cmd_prefix`. /// diff --git a/src/main.rs b/src/main.rs index e0b50eb..3660ccd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,25 +21,25 @@ extern crate failure; #[macro_use] extern crate log; +use log::{Level, LevelFilter, Metadata, Record}; +use std::collections::HashMap; #[cfg(feature = "mysql")] use std::sync::Arc; -use std::collections::HashMap; -use log::{Level, LevelFilter, Metadata, Record}; -use irc::client::reactor::IrcReactor; use glob::glob; +use irc::client::reactor::IrcReactor; +use frippy::plugins::currency::Currency; +use frippy::plugins::emoji::Emoji; +use frippy::plugins::factoids::Factoids; use frippy::plugins::help::Help; -use frippy::plugins::url::Url; +use frippy::plugins::keepnick::KeepNick; use frippy::plugins::sed::Sed; -use frippy::plugins::emoji::Emoji; use frippy::plugins::tell::Tell; -use frippy::plugins::currency::Currency; -use frippy::plugins::keepnick::KeepNick; -use frippy::plugins::factoids::Factoids; +use frippy::plugins::url::UrlTitles; -use frippy::Config; use failure::Error; +use frippy::Config; #[cfg(feature = "mysql")] embed_migrations!(); @@ -136,7 +136,7 @@ fn run() -> Result<(), Error> { let mut bot = frippy::Bot::new(&prefix); bot.add_plugin(Help::new()); - bot.add_plugin(Url::new(1024)); + bot.add_plugin(UrlTitles::new(1024)); bot.add_plugin(Sed::new(60)); bot.add_plugin(Emoji::new()); bot.add_plugin(Currency::new()); diff --git a/src/plugin.rs b/src/plugin.rs index 6c81741..653ec02 100644 --- a/src/plugin.rs +++ b/src/plugin.rs @@ -1,8 +1,8 @@ //! Definitions required for every `Plugin` use std::fmt; -use irc::client::prelude::*; use error::FrippyError; +use irc::client::prelude::*; /// Describes if a [`Plugin`](trait.Plugin.html) is done working on a /// [`Message`](../../irc/proto/message/struct.Message.html) or if another thread is required. diff --git a/src/plugins/currency.rs b/src/plugins/currency.rs index 99f46c8..e7a802d 100644 --- a/src/plugins/currency.rs +++ b/src/plugins/currency.rs @@ -13,8 +13,8 @@ use self::serde_json::Value; use plugin::*; -use error::FrippyError; use error::ErrorKind as FrippyErrorKind; +use error::FrippyError; use failure::ResultExt; #[derive(PluginName, Default, Debug)] diff --git a/src/plugins/emoji.rs b/src/plugins/emoji.rs index f1d9376..4ec7265 100644 --- a/src/plugins/emoji.rs +++ b/src/plugins/emoji.rs @@ -6,8 +6,8 @@ use irc::client::prelude::*; use plugin::*; -use error::FrippyError; use error::ErrorKind as FrippyErrorKind; +use error::FrippyError; use failure::Fail; use failure::ResultExt; diff --git a/src/plugins/factoids/database.rs b/src/plugins/factoids/database.rs index 7788d7c..321931f 100644 --- a/src/plugins/factoids/database.rs +++ b/src/plugins/factoids/database.rs @@ -1,20 +1,20 @@ #[cfg(feature = "mysql")] extern crate dotenv; +use std::collections::HashMap; #[cfg(feature = "mysql")] use std::sync::Arc; -use std::collections::HashMap; +#[cfg(feature = "mysql")] +use diesel::mysql::MysqlConnection; #[cfg(feature = "mysql")] use diesel::prelude::*; #[cfg(feature = "mysql")] -use diesel::mysql::MysqlConnection; +use failure::ResultExt; #[cfg(feature = "mysql")] use r2d2::Pool; #[cfg(feature = "mysql")] use r2d2_diesel::ConnectionManager; -#[cfg(feature = "mysql")] -use failure::ResultExt; use chrono::NaiveDateTime; @@ -124,8 +124,8 @@ impl Database for Arc>> { } fn delete_factoid(&mut self, name: &str, idx: i32) -> Result<(), FactoidsError> { - use diesel; use self::factoids::columns; + use diesel; let conn = &*self.get().context(ErrorKind::NoConnection)?; match diesel::delete( diff --git a/src/plugins/factoids/mod.rs b/src/plugins/factoids/mod.rs index 10e512a..ba3ee8a 100644 --- a/src/plugins/factoids/mod.rs +++ b/src/plugins/factoids/mod.rs @@ -1,13 +1,13 @@ extern crate rlua; -use std::fmt; -use std::str::FromStr; use self::rlua::prelude::*; -use irc::client::prelude::*; use antidote::RwLock; +use irc::client::prelude::*; +use std::fmt; +use std::str::FromStr; -use time; use chrono::NaiveDateTime; +use time; use plugin::*; pub mod database; @@ -15,11 +15,12 @@ use self::database::Database; mod utils; use self::utils::*; +use utils::Url; -use failure::ResultExt; +use self::error::*; use error::ErrorKind as FrippyErrorKind; use error::FrippyError; -use self::error::*; +use failure::ResultExt; static LUA_SANDBOX: &'static str = include_str!("sandbox.lua"); @@ -52,7 +53,8 @@ impl Factoids { created: NaiveDateTime::from_timestamp(tm.sec, 0u32), }; - Ok(self.factoids.write() + Ok(self.factoids + .write() .insert_factoid(&factoid) .map(|()| "Successfully added!")?) } @@ -75,7 +77,10 @@ impl Factoids { let name = command.tokens.remove(0); let url = &command.tokens[0]; - let content = ::utils::download(url, Some(1024)).context(ErrorKind::Download)?; + let content = Url::from(url.as_ref()) + .max_kib(1024) + .request() + .context(ErrorKind::Download)?; Ok(self.create_factoid(&name, &content, &command.source)?) } @@ -118,7 +123,8 @@ impl Factoids { } }; - let factoid = self.factoids.read() + let factoid = self.factoids + .read() .get_factoid(name, idx) .context(ErrorKind::NotFound)?; diff --git a/src/plugins/factoids/utils.rs b/src/plugins/factoids/utils.rs index 70ac8a7..fd08da1 100644 --- a/src/plugins/factoids/utils.rs +++ b/src/plugins/factoids/utils.rs @@ -3,17 +3,18 @@ extern crate reqwest; use std::thread; use std::time::Duration; -use utils; use super::rlua::prelude::*; +use utils::Url; use self::LuaError::RuntimeError; pub fn download(_: &Lua, url: String) -> Result { - match utils::download(&url, Some(1024)) { + let url = Url::from(url).max_kib(1024); + match url.request() { Ok(v) => Ok(v), Err(e) => Err(RuntimeError(format!( "Failed to download {} - {}", - url, + url.as_str(), e.to_string() ))), } diff --git a/src/plugins/help.rs b/src/plugins/help.rs index 7e3658d..9eb152a 100644 --- a/src/plugins/help.rs +++ b/src/plugins/help.rs @@ -2,8 +2,8 @@ use irc::client::prelude::*; use plugin::*; -use error::FrippyError; use error::ErrorKind as FrippyErrorKind; +use error::FrippyError; use failure::ResultExt; #[derive(PluginName, Default, Debug)] diff --git a/src/plugins/keepnick.rs b/src/plugins/keepnick.rs index 58ac167..aa2e485 100644 --- a/src/plugins/keepnick.rs +++ b/src/plugins/keepnick.rs @@ -2,8 +2,8 @@ use irc::client::prelude::*; use plugin::*; -use error::FrippyError; use error::ErrorKind as FrippyErrorKind; +use error::FrippyError; use failure::ResultExt; #[derive(PluginName, Default, Debug)] diff --git a/src/plugins/mod.rs b/src/plugins/mod.rs index 6aed95e..a8fc818 100644 --- a/src/plugins/mod.rs +++ b/src/plugins/mod.rs @@ -1,9 +1,9 @@ //! Collection of plugins included -pub mod help; -pub mod url; -pub mod sed; -pub mod emoji; -pub mod tell; pub mod currency; +pub mod emoji; pub mod factoids; +pub mod help; pub mod keepnick; +pub mod sed; +pub mod tell; +pub mod url; diff --git a/src/plugins/sed.rs b/src/plugins/sed.rs index 8ccb2f7..f766809 100644 --- a/src/plugins/sed.rs +++ b/src/plugins/sed.rs @@ -1,17 +1,17 @@ -use std::collections::HashMap; +use antidote::RwLock; use circular_queue::CircularQueue; use regex::{Regex, RegexBuilder}; -use antidote::RwLock; +use std::collections::HashMap; use irc::client::prelude::*; use plugin::*; -use failure::Fail; -use failure::ResultExt; +use self::error::*; use error::ErrorKind as FrippyErrorKind; use error::FrippyError; -use self::error::*; +use failure::Fail; +use failure::ResultExt; lazy_static! { static ref RE: Regex = Regex::new(r"^s/((?:\\/|[^/])+)/((?:\\/|[^/])*)/(?:(\w+))?\s*$").unwrap(); diff --git a/src/plugins/tell/database.rs b/src/plugins/tell/database.rs index 42c0d88..522df5a 100644 --- a/src/plugins/tell/database.rs +++ b/src/plugins/tell/database.rs @@ -1,15 +1,15 @@ #[cfg(feature = "mysql")] extern crate dotenv; +use std::collections::HashMap; #[cfg(feature = "mysql")] use std::sync::Arc; -use std::collections::HashMap; -#[cfg(feature = "mysql")] -use diesel::prelude::*; #[cfg(feature = "mysql")] use diesel::mysql::MysqlConnection; #[cfg(feature = "mysql")] +use diesel::prelude::*; +#[cfg(feature = "mysql")] use r2d2::Pool; #[cfg(feature = "mysql")] use r2d2_diesel::ConnectionManager; @@ -138,8 +138,8 @@ impl Database for Arc>> { } fn delete_tells(&mut self, receiver: &str) -> Result<(), TellError> { - use diesel; use self::tells::columns; + use diesel; let conn = &*self.get().context(ErrorKind::NoConnection)?; diesel::delete(tells::table.filter(columns::receiver.eq(receiver))) diff --git a/src/plugins/tell/mod.rs b/src/plugins/tell/mod.rs index 42032be..c681d43 100644 --- a/src/plugins/tell/mod.rs +++ b/src/plugins/tell/mod.rs @@ -1,18 +1,18 @@ -use irc::client::prelude::*; use antidote::RwLock; +use irc::client::prelude::*; -use time; -use std::time::Duration; use chrono::NaiveDateTime; use humantime::format_duration; +use std::time::Duration; +use time; use plugin::*; -use failure::Fail; -use failure::ResultExt; +use self::error::*; use error::ErrorKind as FrippyErrorKind; use error::FrippyError; -use self::error::*; +use failure::Fail; +use failure::ResultExt; pub mod database; use self::database::Database; @@ -98,7 +98,8 @@ impl Tell { } fn on_namelist(&self, client: &IrcClient, channel: &str) -> Result<(), FrippyError> { - let receivers = self.tells.read() + let receivers = self.tells + .read() .get_receivers() .context(FrippyErrorKind::Tell)?; diff --git a/src/plugins/url.rs b/src/plugins/url.rs index ec98900..aba5b0d 100644 --- a/src/plugins/url.rs +++ b/src/plugins/url.rs @@ -5,46 +5,49 @@ use irc::client::prelude::*; use regex::Regex; use plugin::*; -use utils; +use utils::Url; use self::error::*; -use error::FrippyError; use error::ErrorKind as FrippyErrorKind; +use error::FrippyError; use failure::Fail; use failure::ResultExt; lazy_static! { - static ref RE: Regex = Regex::new(r"(^|\s)(https?://\S+)").unwrap(); + static ref URL_RE: Regex = Regex::new(r"(^|\s)(https?://\S+)").unwrap(); + static ref WORD_RE: Regex = Regex::new(r"(\w+)").unwrap(); } #[derive(PluginName, Debug)] -pub struct Url { +pub struct UrlTitles { max_kib: usize, } -impl Url { - /// If a file is larger than `max_kib` KiB the download is stopped - pub fn new(max_kib: usize) -> Url { - Url { max_kib: max_kib } - } - - fn grep_url(&self, msg: &str) -> Option { - let captures = RE.captures(msg)?; - debug!("Url captures: {:?}", captures); +#[derive(Clone, Debug)] +struct Title(String); - Some(captures.get(2)?.as_str().to_owned()) +impl From for Title { + fn from(title: String) -> Self { + Title(title) } +} +impl From for String { + fn from(title: Title) -> Self { + title.0 + } +} - fn get_ogtitle<'a>(&self, body: &str) -> Result<String, UrlError> { - let title = body.find("property=\"og:title\"") +impl Title { + fn find_by_delimiters(body: &str, delimiters: [&str; 3]) -> Result<Self, UrlError> { + let title = body.find(delimiters[0]) .map(|tag| { body[tag..] - .find("content=\"") - .map(|offset| tag + offset + 9) + .find(delimiters[1]) + .map(|offset| tag + offset + delimiters[1].len()) .map(|start| { body[start..] - .find("\"") + .find(delimiters[2]) .map(|offset| start + offset) .map(|end| &body[start..end]) }) @@ -52,53 +55,96 @@ impl Url { .and_then(|s| s.and_then(|s| s)) .ok_or(ErrorKind::MissingTitle)?; - debug!("Title: {:?}", title); + debug!("delimiters: {:?}", delimiters); + debug!("title: {:?}", title); - htmlescape::decode_html(title).map_err(|_| ErrorKind::HtmlDecoding.into()) + htmlescape::decode_html(title) + .map(|t| t.into()) + .map_err(|_| ErrorKind::HtmlDecoding.into()) } - fn get_title<'a>(&self, body: &str) -> Result<String, UrlError> { - let title = body.find("<title") - .map(|tag| { - body[tag..] - .find('>') - .map(|offset| tag + offset + 1) - .map(|start| { - body[start..] - .find("") - .map(|offset| start + offset) - .map(|end| &body[start..end]) - }) - }) - .and_then(|s| s.and_then(|s| s)) - .ok_or(ErrorKind::MissingTitle)?; + fn find_ogtitle<'a>(body: &str) -> Result { + Self::find_by_delimiters(body, ["property=\"og:title\"", "content=\"", "\""]) + } - debug!("Title: {:?}", title); + fn find_title<'a>(body: &str) -> Result { + Self::find_by_delimiters(body, ["", ""]) + } - htmlescape::decode_html(title).map_err(|_| ErrorKind::HtmlDecoding.into()) + // TODO Improve logic + fn is_useful(&self, url: &str) -> bool { + for word in WORD_RE.find_iter(&self.0) { + let w = word.as_str().to_lowercase(); + if w.len() > 2 && !url.to_lowercase().contains(&w) { + return true; + } + } + + return false; + } + + fn into_useful_title<'a>(self, url: &str) -> Result { + if self.is_useful(url) { + Ok(self) + } else { + Err(ErrorKind::UselessTitle)? + } + } + + fn clean_up(self) -> Self { + self.0.trim().replace('\n', "|").replace('\r', "|").into() + } + + pub fn find_useful_ogtitle<'a>(body: &str, url: &str) -> Result { + Self::find_ogtitle(body) + .and_then(|t| t.into_useful_title(url)) + .map(|t| t.clean_up()) + } + + pub fn find_useful_title<'a>(body: &str, url: &str) -> Result { + Self::find_title(body) + .and_then(|t| t.into_useful_title(url)) + .map(|t| t.clean_up()) + } +} + +impl UrlTitles { + /// If a file is larger than `max_kib` KiB the download is stopped + pub fn new(max_kib: usize) -> Self { + UrlTitles { max_kib: max_kib } + } + + fn grep_url<'a>(&self, msg: &'a str) -> Option> { + let captures = URL_RE.captures(msg)?; + debug!("Url captures: {:?}", captures); + + Some(captures.get(2)?.as_str().into()) } fn url(&self, text: &str) -> Result { - let url = self.grep_url(text).ok_or(ErrorKind::MissingUrl)?; - let body = utils::download(&url, Some(self.max_kib)).context(ErrorKind::Download)?; + let url = self.grep_url(text) + .ok_or(ErrorKind::MissingUrl)? + .max_kib(self.max_kib); + let body = url.request().context(ErrorKind::Download)?; - let title = match self.get_ogtitle(&body) { + let title = match Title::find_useful_ogtitle(&body, url.as_str()) { Ok(t) => t, - Err(e) => if e.kind() == ErrorKind::MissingTitle { - self.get_title(&body)? - } else { - Err(e)? - } + Err(e) => match e.kind() { + ErrorKind::MissingTitle | ErrorKind::UselessTitle => { + Title::find_useful_title(&body, url.as_str())? + } + _ => Err(e)?, + }, }; - Ok(title.trim().replace('\n', "|").replace('\r', "|")) + Ok(title.into()) } } -impl Plugin for Url { +impl Plugin for UrlTitles { fn execute(&self, _: &IrcClient, message: &Message) -> ExecutionStatus { match message.command { - Command::PRIVMSG(_, ref msg) => if RE.is_match(msg) { + Command::PRIVMSG(_, ref msg) => if URL_RE.is_match(msg) { ExecutionStatus::RequiresThread } else { ExecutionStatus::Done @@ -151,6 +197,10 @@ pub mod error { #[fail(display = "No title was found")] MissingTitle, + /// Useless title error + #[fail(display = "Title was not helpful")] + UselessTitle, + /// Html decoding error #[fail(display = "Failed to decode Html characters")] HtmlDecoding, diff --git a/src/utils.rs b/src/utils.rs index b6c4cf4..6614095 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,49 +1,85 @@ -use std::str; +use std::borrow::Cow; use std::io::{self, Read}; use reqwest::Client; use reqwest::header::Connection; -use failure::ResultExt; use self::error::{DownloadError, ErrorKind}; +use failure::ResultExt; + +#[derive(Clone, Debug)] +pub struct Url<'a> { + url: Cow<'a, str>, + max_kib: Option, +} + +impl<'a> From for Url<'a> { + fn from(url: String) -> Self { + Url { + url: Cow::from(url), + max_kib: None, + } + } +} + +impl<'a> From<&'a str> for Url<'a> { + fn from(url: &'a str) -> Self { + Url { + url: Cow::from(url), + max_kib: None, + } + } +} + +impl<'a> Url<'a> { + pub fn max_kib(mut self, limit: usize) -> Self { + self.max_kib = Some(limit); + self + } + + /// Downloads the file and converts it to a String. + /// Any invalid bytes are converted to a replacement character. + /// + /// The error indicated either a failed download or + /// that the limit set by max_kib() was reached. + pub fn request(&self) -> Result { + let mut response = Client::new() + .get(self.url.as_ref()) + .header(Connection::close()) + .send() + .context(ErrorKind::Connection)?; -/// Downloads the file and converts it to a String. -/// Any invalid bytes are converted to a replacement character. -/// -/// The error indicated either a failed download or that the DownloadLimit was reached -pub fn download(url: &str, max_kib: Option) -> Result { - let mut response = Client::new() - .get(url) - .header(Connection::close()) - .send() - .context(ErrorKind::Connection)?; - - // 100 kibibyte buffer - let mut buf = [0; 100 * 1024]; - let mut written = 0; - let mut bytes = Vec::new(); - - // Read until we reach EOF or max_kib KiB - loop { - let len = match response.read(&mut buf) { - Ok(0) => break, - Ok(len) => len, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(e).context(ErrorKind::Read)?, - }; - - bytes.extend_from_slice(&buf[..len]); - written += len; - - // Check if the file is too large to download - if let Some(max_kib) = max_kib { - if written > max_kib * 1024 { - Err(ErrorKind::DownloadLimit)?; + // 100 kibibyte buffer + let mut buf = [0; 100 * 1024]; + let mut written = 0; + let mut bytes = Vec::new(); + + // Read until we reach EOF or max_kib KiB + loop { + let len = match response.read(&mut buf) { + Ok(0) => break, + Ok(len) => len, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(e).context(ErrorKind::Read)?, + }; + + bytes.extend_from_slice(&buf[..len]); + written += len; + + // Check if the file is too large to download + if let Some(max_kib) = self.max_kib { + if written > max_kib * 1024 { + Err(ErrorKind::DownloadLimit)?; + } } } + + Ok(String::from_utf8_lossy(&bytes).into_owned()) } - Ok(String::from_utf8_lossy(&bytes).into_owned()) + pub fn as_str(&self) -> &str { + &self.url + } } pub mod error { -- cgit v1.2.3-70-g09d2