summary refs log tree commit diff stats
path: root/src/plugins/url.rs
diff options
context:
space:
mode:
author Jokler <jokler.contact@gmail.com> 2018-04-07 18:19:24 +0200
committer Jokler <jokler.contact@gmail.com> 2018-04-07 18:19:24 +0200
commit 5c45046794e3c93f875d69193bb12d6608d45a8c (patch)
tree a446511384f7ba7eb0dba2c70970a955cbfe1403 /src/plugins/url.rs
parent 4624f7e153769fa97401f5e906c6d17cf1127083 (diff)
download frippy-5c45046794e3c93f875d69193bb12d6608d45a8c.tar.gz
frippy-5c45046794e3c93f875d69193bb12d6608d45a8c.zip
Add usefulness rating function to url titles
Diffstat (limited to 'src/plugins/url.rs')
-rw-r--r-- src/plugins/url.rs 148
1 files changed, 99 insertions, 49 deletions
diff --git a/src/plugins/url.rs b/src/plugins/url.rs
index ec98900..aba5b0d 100644
--- a/src/plugins/url.rs
+++ b/src/plugins/url.rs
@@ -5,46 +5,49 @@ use irc::client::prelude::*;
use regex::Regex;
use plugin::*;
-use utils;
+use utils::Url;
use self::error::*;
-use error::FrippyError;
use error::ErrorKind as FrippyErrorKind;
+use error::FrippyError;
use failure::Fail;
use failure::ResultExt;
lazy_static! {
- static ref RE: Regex = Regex::new(r"(^|\s)(https?://\S+)").unwrap();
+ static ref URL_RE: Regex = Regex::new(r"(^|\s)(https?://\S+)").unwrap();
+ static ref WORD_RE: Regex = Regex::new(r"(\w+)").unwrap();
}
#[derive(PluginName, Debug)]
-pub struct Url {
+pub struct UrlTitles {
max_kib: usize,
}
-impl Url {
- /// If a file is larger than `max_kib` KiB the download is stopped
- pub fn new(max_kib: usize) -> Url {
- Url { max_kib: max_kib }
- }
-
- fn grep_url(&self, msg: &str) -> Option<String> {
- let captures = RE.captures(msg)?;
- debug!("Url captures: {:?}", captures);
+#[derive(Clone, Debug)]
+struct Title(String);
- Some(captures.get(2)?.as_str().to_owned())
+impl From<String> for Title {
+ fn from(title: String) -> Self {
+ Title(title)
}
+}
+impl From<Title> for String {
+ fn from(title: Title) -> Self {
+ title.0
+ }
+}
- fn get_ogtitle<'a>(&self, body: &str) -> Result<String, UrlError> {
- let title = body.find("property=\"og:title\"")
+impl Title {
+ fn find_by_delimiters(body: &str, delimiters: [&str; 3]) -> Result<Self, UrlError> {
+ let title = body.find(delimiters[0])
.map(|tag| {
body[tag..]
- .find("content=\"")
- .map(|offset| tag + offset + 9)
+ .find(delimiters[1])
+ .map(|offset| tag + offset + delimiters[1].len())
.map(|start| {
body[start..]
- .find("\"")
+ .find(delimiters[2])
.map(|offset| start + offset)
.map(|end| &body[start..end])
})
@@ -52,53 +55,96 @@ impl Url {
.and_then(|s| s.and_then(|s| s))
.ok_or(ErrorKind::MissingTitle)?;
- debug!("Title: {:?}", title);
+ debug!("delimiters: {:?}", delimiters);
+ debug!("title: {:?}", title);
- htmlescape::decode_html(title).map_err(|_| ErrorKind::HtmlDecoding.into())
+ htmlescape::decode_html(title)
+ .map(|t| t.into())
+ .map_err(|_| ErrorKind::HtmlDecoding.into())
}
- fn get_title<'a>(&self, body: &str) -> Result<String, UrlError> {
- let title = body.find("<title")
- .map(|tag| {
- body[tag..]
- .find('>')
- .map(|offset| tag + offset + 1)
- .map(|start| {
- body[start..]
- .find("</title>")
- .map(|offset| start + offset)
- .map(|end| &body[start..end])
- })
- })
- .and_then(|s| s.and_then(|s| s))
- .ok_or(ErrorKind::MissingTitle)?;
+ fn find_ogtitle<'a>(body: &str) -> Result<Self, UrlError> {
+ Self::find_by_delimiters(body, ["property=\"og:title\"", "content=\"", "\""])
+ }
- debug!("Title: {:?}", title);
+ fn find_title<'a>(body: &str) -> Result<Self, UrlError> {
+ Self::find_by_delimiters(body, ["<title", ">", "</title>"])
+ }
- htmlescape::decode_html(title).map_err(|_| ErrorKind::HtmlDecoding.into())
+ // TODO Improve logic
+ fn is_useful(&self, url: &str) -> bool {
+ for word in WORD_RE.find_iter(&self.0) {
+ let w = word.as_str().to_lowercase();
+ if w.len() > 2 && !url.to_lowercase().contains(&w) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ fn into_useful_title<'a>(self, url: &str) -> Result<Self, UrlError> {
+ if self.is_useful(url) {
+ Ok(self)
+ } else {
+ Err(ErrorKind::UselessTitle)?
+ }
+ }
+
+ fn clean_up(self) -> Self {
+ self.0.trim().replace('\n', "|").replace('\r', "|").into()
+ }
+
+ pub fn find_useful_ogtitle<'a>(body: &str, url: &str) -> Result<Self, UrlError> {
+ Self::find_ogtitle(body)
+ .and_then(|t| t.into_useful_title(url))
+ .map(|t| t.clean_up())
+ }
+
+ pub fn find_useful_title<'a>(body: &str, url: &str) -> Result<Self, UrlError> {
+ Self::find_title(body)
+ .and_then(|t| t.into_useful_title(url))
+ .map(|t| t.clean_up())
+ }
+}
+
+impl UrlTitles {
+ /// If a file is larger than `max_kib` KiB the download is stopped
+ pub fn new(max_kib: usize) -> Self {
+ UrlTitles { max_kib: max_kib }
+ }
+
+ fn grep_url<'a>(&self, msg: &'a str) -> Option<Url<'a>> {
+ let captures = URL_RE.captures(msg)?;
+ debug!("Url captures: {:?}", captures);
+
+ Some(captures.get(2)?.as_str().into())
}
fn url(&self, text: &str) -> Result<String, UrlError> {
- let url = self.grep_url(text).ok_or(ErrorKind::MissingUrl)?;
- let body = utils::download(&url, Some(self.max_kib)).context(ErrorKind::Download)?;
+ let url = self.grep_url(text)
+ .ok_or(ErrorKind::MissingUrl)?
+ .max_kib(self.max_kib);
+ let body = url.request().context(ErrorKind::Download)?;
- let title = match self.get_ogtitle(&body) {
+ let title = match Title::find_useful_ogtitle(&body, url.as_str()) {
Ok(t) => t,
- Err(e) => if e.kind() == ErrorKind::MissingTitle {
- self.get_title(&body)?
- } else {
- Err(e)?
- }
+ Err(e) => match e.kind() {
+ ErrorKind::MissingTitle | ErrorKind::UselessTitle => {
+ Title::find_useful_title(&body, url.as_str())?
+ }
+ _ => Err(e)?,
+ },
};
- Ok(title.trim().replace('\n', "|").replace('\r', "|"))
+ Ok(title.into())
}
}
-impl Plugin for Url {
+impl Plugin for UrlTitles {
fn execute(&self, _: &IrcClient, message: &Message) -> ExecutionStatus {
match message.command {
- Command::PRIVMSG(_, ref msg) => if RE.is_match(msg) {
+ Command::PRIVMSG(_, ref msg) => if URL_RE.is_match(msg) {
ExecutionStatus::RequiresThread
} else {
ExecutionStatus::Done
@@ -151,6 +197,10 @@ pub mod error {
#[fail(display = "No title was found")]
MissingTitle,
+ /// Useless title error
+ #[fail(display = "Title was not helpful")]
+ UselessTitle,
+
/// Html decoding error
#[fail(display = "Failed to decode Html characters")]
HtmlDecoding,