Sunday, June 16, 2013

Blog Categorisation using Encog, ROME, JSoup and Google Guava

Continuing with Programming Collective Intelligence (PCI), the next exercise was to use distance scores to pigeonhole a list of blogs based on the words used within each blog.
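The "distance" here comes from treating each blog as a vector of word counts: blogs that use words in similar proportions sit close together, and k-means groups the vectors that lie nearest each other. As a minimal sketch, here is the Euclidean distance this style of clustering is built around (my own illustration; the class and method names are not from the original code):

public final class DistanceSketch {

    // Euclidean distance between two blogs' word-count vectors;
    // a smaller value means more similar word usage.
    public static double euclidean(final double[] blogA, final double[] blogB) {
        double sumOfSquares = 0;
        for (int i = 0; i < blogA.length; i++) {
            final double diff = blogA[i] - blogB[i];
            sumOfSquares += diff * diff;
        }
        return Math.sqrt(sumOfSquares);
    }
}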

I had already found Encog as the framework for the AI / machine learning algorithms; for this exercise I also needed an RSS reader and an HTML parser.
The two libraries I ended up using were:
ROME
JSoup

For other general utilities and collection manipulation I used:
Google Guava

I kept the list of blogs short and included some of the software bloggers I follow, just to keep testing quick. I had to alter the percentages a little from the implementation in PCI, but still got the desired result: words that are too rare or too common carry little signal for clustering, so they get filtered out by percentage thresholds first.
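Concretely, the filter keeps a word only when its overall frequency sits between a lower and an upper bound and the word is at least three characters long; the full version is in stripOutlyingWords further down. A minimal sketch of the rule with the thresholds I settled on (keepWord and totalWords are illustrative names; totalWords is the size of the unique-word list, which is the tweak mentioned above):

public final class WordFilterSketch {

    // Keep a word only if it is neither too rare (< 0.1%) nor too
    // common (> 20%), and long enough to carry meaning.
    public static boolean keepWord(final String word, final double wordCount, final double totalWords) {
        final double percentage = (wordCount / totalWords) * 100;
        return percentage >= 0.1 && percentage <= 20 && word.length() >= 3;
    }
}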

Blogs Used:

http://blog.guykawasaki.com/index.rdf
http://blog.outer-court.com/rss.xml
http://flagrantdisregard.com/index.php/feed/
http://gizmodo.com/index.xml
http://googleblog.blogspot.com/rss.xml
http://radar.oreilly.com/index.rdf
http://www.wired.com/rss/index.xml
http://feeds.feedburner.com/codinghorror
http://feeds.feedburner.com/joelonsoftware
http://martinfowler.com/feed.atom
http://www.briandupreez.net/feeds/posts/default

For the implementation I just went with a main class and a reader class:

package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Collections2;
import com.sun.syndication.feed.synd.SyndCategoryImpl;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URL;
import java.util.*;

public class FeedReader {

    @SuppressWarnings("unchecked")
    public static Set<String> determineAllUniqueWords(final String url, final Set<String> blogWordList) {
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));
            final List<SyndEntryImpl> entries = feed.getEntries();
            for (final SyndEntryImpl entry : entries) {
                // Collect words from every part of the entry: title, categories, description and content.
                blogWordList.addAll(cleanAndSplitString(entry.getTitle()));
                blogWordList.addAll(doCategories(entry));
                blogWordList.addAll(doDescription(entry));
                blogWordList.addAll(doContent(entry));
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: failed to read feed: " + url + "\n" + ex.getMessage());
        }
        return blogWordList;
    }

    @SuppressWarnings("unchecked")
    private static List<String> doContent(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndContent> contents = entry.getContents();
        if (contents != null) {
            for (final SyndContent syndContent : contents) {
                // The MIME type lives on getType(); HTML content is stripped of markup first.
                if ("text/html".equals(syndContent.getType())) {
                    blogWordList.addAll(stripHtmlAndAddText(syndContent));
                } else {
                    blogWordList.addAll(cleanAndSplitString(syndContent.getValue()));
                }
            }
        }
        return blogWordList;
    }

    private static List<String> doDescription(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final SyndContent description = entry.getDescription();
        if (description != null) {
            if ("text/html".equals(description.getType())) {
                blogWordList.addAll(stripHtmlAndAddText(description));
            } else {
                blogWordList.addAll(cleanAndSplitString(description.getValue()));
            }
        }
        return blogWordList;
    }

    @SuppressWarnings("unchecked")
    private static List<String> doCategories(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndCategoryImpl> categories = entry.getCategories();
        for (final SyndCategoryImpl category : categories) {
            blogWordList.add(category.getName().toLowerCase());
        }
        return blogWordList;
    }

    private static List<String> stripHtmlAndAddText(final SyndContent description) {
        // JSoup parses the HTML fragment; element.text() gives just the visible text.
        final String html = description.getValue();
        final Document document = Jsoup.parse(html);
        final Elements elements = document.getAllElements();
        final List<String> allWords = new ArrayList<>();
        for (final Element element : elements) {
            allWords.addAll(cleanAndSplitString(element.text()));
        }
        return allWords;
    }

    private static List<String> cleanAndSplitString(final String input) {
        if (input != null) {
            // Lower-case, strip punctuation and digits, then split on whitespace.
            final String[] dic = input.toLowerCase().replaceAll("\\p{Punct}", "").replaceAll("\\p{Digit}", "").split("\\s+");
            return Arrays.asList(dic);
        }
        return new ArrayList<>();
    }

    @SuppressWarnings("unchecked")
    public static Map<String, Double> countWords(final String url, final Set<String> blogWords) {
        final Map<String, Double> resultMap = new TreeMap<>();
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));
            final List<SyndEntryImpl> entries = feed.getEntries();
            final List<String> allBlogWords = new ArrayList<>();
            for (final SyndEntryImpl entry : entries) {
                allBlogWords.addAll(cleanAndSplitString(entry.getTitle()));
                allBlogWords.addAll(doCategories(entry));
                allBlogWords.addAll(doDescription(entry));
                allBlogWords.addAll(doContent(entry));
            }
            for (final String word : blogWords) {
                // Guava: count occurrences of a word by filtering the full list down to matches.
                resultMap.put(word, (double) Collections2.filter(allBlogWords, Predicates.equalTo(word)).size());
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: failed to read feed: " + url + "\n" + ex.getMessage());
        }
        return resultMap;
    }
}

Main:

package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Maps;
import com.google.common.io.Resources;
import com.google.common.primitives.Doubles;
import org.encog.ml.MLCluster;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.kmeans.KMeansClustering;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

public class FeedReaderMain {

    public static void main(final String[] args) {
        final FeedReaderMain feedReaderMain = new FeedReaderMain();
        try {
            feedReaderMain.run();
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }

    public void run() throws IOException {
        // short-feedlist.txt is a classpath resource with one feed URL per line.
        final String file = Resources.getResource("short-feedlist.txt").getFile();
        final Set<String> blogWords = determineWordCompleteList(file);
        final Map<String, Map<String, Double>> blogWordCount = countWordsPerBlog(file, blogWords);
        // Strip out the outlying words before clustering.
        stripOutlyingWords(blogWords, blogWordCount);
        performClusteringAndDisplay(blogWordCount);
    }

    private void performClusteringAndDisplay(final Map<String, Map<String, Double>> blogWordCount) {
        final BasicMLDataSet set = new BasicMLDataSet();
        final Map<String, List<Double>> inputMap = new HashMap<>();
        for (final Map.Entry<String, Map<String, Double>> entry : blogWordCount.entrySet()) {
            // One input vector per blog: its word counts (the TreeMap keeps word order consistent across blogs).
            final Map<String, Double> mainValues = entry.getValue();
            final double[] elements = Doubles.toArray(mainValues.values());
            final List<Double> listInput = Doubles.asList(elements);
            inputMap.put(entry.getKey(), listInput);
            set.add(new BasicMLData(elements));
        }
        // Cluster the word-count vectors into 3 groups over 150 iterations.
        final KMeansClustering kmeans = new KMeansClustering(3, set);
        kmeans.iteration(150);
        // Display the clusters, mapping each vector back to its blog URL via inputMap.
        int i = 1;
        for (final MLCluster cluster : kmeans.getClusters()) {
            System.out.println("*** Cluster " + (i++) + " ***");
            final MLDataSet ds = cluster.createDataSet();
            final MLDataPair pair = BasicMLDataPair.createPair(ds.getInputSize(), ds.getIdealSize());
            for (int j = 0; j < ds.getRecordCount(); j++) {
                ds.getRecord(j, pair);
                final List<Double> listInput = Doubles.asList(pair.getInputArray());
                System.out.println(Maps.filterValues(inputMap, Predicates.equalTo(listInput)).keySet().toString());
            }
        }
    }

    private Map<String, Map<String, Double>> countWordsPerBlog(final String file, final Set<String> blogWords) throws IOException {
        final BufferedReader reader = new BufferedReader(new FileReader(file));
        final Map<String, Map<String, Double>> blogWordCount = new HashMap<>();
        String line;
        while ((line = reader.readLine()) != null) {
            final Map<String, Double> wordCounts = FeedReader.countWords(line, blogWords);
            blogWordCount.put(line, wordCounts);
        }
        return blogWordCount;
    }

    private Set<String> determineWordCompleteList(final String file) throws IOException {
        final BufferedReader reader = new BufferedReader(new FileReader(file));
        String line;
        Set<String> blogWords = new HashSet<>();
        while ((line = reader.readLine()) != null) {
            blogWords = FeedReader.determineAllUniqueWords(line, blogWords);
            System.out.println("Size: " + blogWords.size());
        }
        return blogWords;
    }

    private void stripOutlyingWords(final Set<String> blogWords, final Map<String, Map<String, Double>> blogWordCount) {
        final Iterator<String> wordIter = blogWords.iterator();
        final double listSize = blogWords.size();
        while (wordIter.hasNext()) {
            final String word = wordIter.next();
            double wordCount = 0;
            for (final Map<String, Double> values : blogWordCount.values()) {
                wordCount += values.get(word) != null ? values.get(word) : 0;
            }
            // Frequency relative to the size of the unique-word list -- the tweak from PCI's percentages.
            final double percentage = (wordCount / listSize) * 100;
            if (percentage < 0.1 || percentage > 20 || word.length() < 3) {
                wordIter.remove();
                for (final Map<String, Double> values : blogWordCount.values()) {
                    values.remove(word);
                }
            } else {
                System.out.println("\t keeping: " + word + " Percentage: " + percentage);
            }
        }
    }
}

The Results:


*** Cluster 1 ***
[http://www.briandupreez.net/feeds/posts/default]
*** Cluster 2 ***
[http://blog.guykawasaki.com/index.rdf]
[http://radar.oreilly.com/index.rdf]
[http://googleblog.blogspot.com/rss.xml]
[http://blog.outer-court.com/rss.xml]
[http://gizmodo.com/index.xml]
[http://flagrantdisregard.com/index.php/feed/]
[http://www.wired.com/rss/index.xml]
*** Cluster 3 ***
[http://feeds.feedburner.com/joelonsoftware]
[http://feeds.feedburner.com/codinghorror]
[http://martinfowler.com/feed.atom]
