Sunday, June 16, 2013

Blog Categorisation using Encog, ROME, JSoup and Google Guava

Continuing with Programming Collective Intelligence (PCI), the next exercise was to use distance scores to pigeonhole a list of blogs based on the words used within each blog.
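The "distance" here comes from treating each blog as a vector of word counts: blogs that use words in similar proportions sit close together, and k-means groups the vectors that lie nearest each other. As a minimal sketch, here is the Euclidean distance this style of clustering is built around (my own illustration; the class and method names are not from the original code):

public final class DistanceSketch {

    // Euclidean distance between two blogs' word-count vectors;
    // a smaller value means more similar word usage.
    public static double euclidean(final double[] blogA, final double[] blogB) {
        double sumOfSquares = 0;
        for (int i = 0; i < blogA.length; i++) {
            final double diff = blogA[i] - blogB[i];
            sumOfSquares += diff * diff;
        }
        return Math.sqrt(sumOfSquares);
    }
}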

I had already found Encog as the framework for the AI / machine learning algorithms; for this exercise I also needed an RSS reader and an HTML parser.
The two libraries I ended up using were:
ROME
JSoup

For other general utilities and collection manipulation I used:
Google Guava

I kept the list of blogs short and included some of the software bloggers I follow, just to keep testing quick. I had to alter the percentages a little from the implementation in PCI, but still got the desired result: words that are too rare or too common carry little signal for clustering, so they get filtered out by percentage thresholds first.
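Concretely, the filter keeps a word only when its overall frequency sits between a lower and an upper bound and the word is at least three characters long; the full version is in stripOutlyingWords further down. A minimal sketch of the rule with the thresholds I settled on (keepWord and totalWords are illustrative names; totalWords is the size of the unique-word list, which is the tweak mentioned above):

public final class WordFilterSketch {

    // Keep a word only if it is neither too rare (< 0.1%) nor too
    // common (> 20%), and long enough to carry meaning.
    public static boolean keepWord(final String word, final double wordCount, final double totalWords) {
        final double percentage = (wordCount / totalWords) * 100;
        return percentage >= 0.1 && percentage <= 20 && word.length() >= 3;
    }
}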

Blogs Used:

http://blog.guykawasaki.com/index.rdf
http://blog.outer-court.com/rss.xml
http://flagrantdisregard.com/index.php/feed/
http://gizmodo.com/index.xml
http://googleblog.blogspot.com/rss.xml
http://radar.oreilly.com/index.rdf
http://www.wired.com/rss/index.xml
http://feeds.feedburner.com/codinghorror
http://feeds.feedburner.com/joelonsoftware
http://martinfowler.com/feed.atom
http://www.briandupreez.net/feeds/posts/default

For the implementation I just went with a main class and a reader class:

package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Collections2;
import com.sun.syndication.feed.synd.SyndCategoryImpl;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URL;
import java.util.*;

public class FeedReader {

    @SuppressWarnings("unchecked")
    public static Set<String> determineAllUniqueWords(final String url, final Set<String> blogWordList) {
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));
            final List<SyndEntryImpl> entries = feed.getEntries();
            for (final SyndEntryImpl entry : entries) {
                // Collect words from every part of the entry: title, categories, description and content.
                blogWordList.addAll(cleanAndSplitString(entry.getTitle()));
                blogWordList.addAll(doCategories(entry));
                blogWordList.addAll(doDescription(entry));
                blogWordList.addAll(doContent(entry));
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: failed to read feed: " + url + "\n" + ex.getMessage());
        }
        return blogWordList;
    }

    @SuppressWarnings("unchecked")
    private static List<String> doContent(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndContent> contents = entry.getContents();
        if (contents != null) {
            for (final SyndContent syndContent : contents) {
                // The MIME type lives on getType(); HTML content is stripped of markup first.
                if ("text/html".equals(syndContent.getType())) {
                    blogWordList.addAll(stripHtmlAndAddText(syndContent));
                } else {
                    blogWordList.addAll(cleanAndSplitString(syndContent.getValue()));
                }
            }
        }
        return blogWordList;
    }

    private static List<String> doDescription(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final SyndContent description = entry.getDescription();
        if (description != null) {
            if ("text/html".equals(description.getType())) {
                blogWordList.addAll(stripHtmlAndAddText(description));
            } else {
                blogWordList.addAll(cleanAndSplitString(description.getValue()));
            }
        }
        return blogWordList;
    }

    @SuppressWarnings("unchecked")
    private static List<String> doCategories(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndCategoryImpl> categories = entry.getCategories();
        for (final SyndCategoryImpl category : categories) {
            blogWordList.add(category.getName().toLowerCase());
        }
        return blogWordList;
    }

    private static List<String> stripHtmlAndAddText(final SyndContent description) {
        // JSoup parses the HTML fragment; element.text() gives just the visible text.
        final String html = description.getValue();
        final Document document = Jsoup.parse(html);
        final Elements elements = document.getAllElements();
        final List<String> allWords = new ArrayList<>();
        for (final Element element : elements) {
            allWords.addAll(cleanAndSplitString(element.text()));
        }
        return allWords;
    }

    private static List<String> cleanAndSplitString(final String input) {
        if (input != null) {
            // Lower-case, strip punctuation and digits, then split on whitespace.
            final String[] dic = input.toLowerCase().replaceAll("\\p{Punct}", "").replaceAll("\\p{Digit}", "").split("\\s+");
            return Arrays.asList(dic);
        }
        return new ArrayList<>();
    }

    @SuppressWarnings("unchecked")
    public static Map<String, Double> countWords(final String url, final Set<String> blogWords) {
        final Map<String, Double> resultMap = new TreeMap<>();
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));
            final List<SyndEntryImpl> entries = feed.getEntries();
            final List<String> allBlogWords = new ArrayList<>();
            for (final SyndEntryImpl entry : entries) {
                allBlogWords.addAll(cleanAndSplitString(entry.getTitle()));
                allBlogWords.addAll(doCategories(entry));
                allBlogWords.addAll(doDescription(entry));
                allBlogWords.addAll(doContent(entry));
            }
            for (final String word : blogWords) {
                // Guava: count occurrences of a word by filtering the full list down to matches.
                resultMap.put(word, (double) Collections2.filter(allBlogWords, Predicates.equalTo(word)).size());
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: failed to read feed: " + url + "\n" + ex.getMessage());
        }
        return resultMap;
    }
}

Main:

package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Maps;
import com.google.common.io.Resources;
import com.google.common.primitives.Doubles;
import org.encog.ml.MLCluster;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.kmeans.KMeansClustering;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

public class FeedReaderMain {

    public static void main(final String[] args) {
        final FeedReaderMain feedReaderMain = new FeedReaderMain();
        try {
            feedReaderMain.run();
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }

    public void run() throws IOException {
        // short-feedlist.txt is a classpath resource with one feed URL per line.
        final String file = Resources.getResource("short-feedlist.txt").getFile();
        final Set<String> blogWords = determineWordCompleteList(file);
        final Map<String, Map<String, Double>> blogWordCount = countWordsPerBlog(file, blogWords);
        // Strip out the outlying words before clustering.
        stripOutlyingWords(blogWords, blogWordCount);
        performClusteringAndDisplay(blogWordCount);
    }

    private void performClusteringAndDisplay(final Map<String, Map<String, Double>> blogWordCount) {
        final BasicMLDataSet set = new BasicMLDataSet();
        final Map<String, List<Double>> inputMap = new HashMap<>();
        for (final Map.Entry<String, Map<String, Double>> entry : blogWordCount.entrySet()) {
            // One input vector per blog: its word counts (the TreeMap keeps word order consistent across blogs).
            final Map<String, Double> mainValues = entry.getValue();
            final double[] elements = Doubles.toArray(mainValues.values());
            final List<Double> listInput = Doubles.asList(elements);
            inputMap.put(entry.getKey(), listInput);
            set.add(new BasicMLData(elements));
        }
        // Cluster the word-count vectors into 3 groups over 150 iterations.
        final KMeansClustering kmeans = new KMeansClustering(3, set);
        kmeans.iteration(150);
        // Display the clusters, mapping each vector back to its blog URL via inputMap.
        int i = 1;
        for (final MLCluster cluster : kmeans.getClusters()) {
            System.out.println("*** Cluster " + (i++) + " ***");
            final MLDataSet ds = cluster.createDataSet();
            final MLDataPair pair = BasicMLDataPair.createPair(ds.getInputSize(), ds.getIdealSize());
            for (int j = 0; j < ds.getRecordCount(); j++) {
                ds.getRecord(j, pair);
                final List<Double> listInput = Doubles.asList(pair.getInputArray());
                System.out.println(Maps.filterValues(inputMap, Predicates.equalTo(listInput)).keySet().toString());
            }
        }
    }

    private Map<String, Map<String, Double>> countWordsPerBlog(final String file, final Set<String> blogWords) throws IOException {
        final BufferedReader reader = new BufferedReader(new FileReader(file));
        final Map<String, Map<String, Double>> blogWordCount = new HashMap<>();
        String line;
        while ((line = reader.readLine()) != null) {
            final Map<String, Double> wordCounts = FeedReader.countWords(line, blogWords);
            blogWordCount.put(line, wordCounts);
        }
        return blogWordCount;
    }

    private Set<String> determineWordCompleteList(final String file) throws IOException {
        final BufferedReader reader = new BufferedReader(new FileReader(file));
        String line;
        Set<String> blogWords = new HashSet<>();
        while ((line = reader.readLine()) != null) {
            blogWords = FeedReader.determineAllUniqueWords(line, blogWords);
            System.out.println("Size: " + blogWords.size());
        }
        return blogWords;
    }

    private void stripOutlyingWords(final Set<String> blogWords, final Map<String, Map<String, Double>> blogWordCount) {
        final Iterator<String> wordIter = blogWords.iterator();
        final double listSize = blogWords.size();
        while (wordIter.hasNext()) {
            final String word = wordIter.next();
            double wordCount = 0;
            for (final Map<String, Double> values : blogWordCount.values()) {
                wordCount += values.get(word) != null ? values.get(word) : 0;
            }
            // Frequency relative to the size of the unique-word list -- the tweak from PCI's percentages.
            final double percentage = (wordCount / listSize) * 100;
            if (percentage < 0.1 || percentage > 20 || word.length() < 3) {
                wordIter.remove();
                for (final Map<String, Double> values : blogWordCount.values()) {
                    values.remove(word);
                }
            } else {
                System.out.println("\t keeping: " + word + " Percentage: " + percentage);
            }
        }
    }
}

The Results:


*** Cluster 1 ***
[http://www.briandupreez.net/feeds/posts/default]
*** Cluster 2 ***
[http://blog.guykawasaki.com/index.rdf]
[http://radar.oreilly.com/index.rdf]
[http://googleblog.blogspot.com/rss.xml]
[http://blog.outer-court.com/rss.xml]
[http://gizmodo.com/index.xml]
[http://flagrantdisregard.com/index.php/feed/]
[http://www.wired.com/rss/index.xml]
*** Cluster 3 ***
[http://feeds.feedburner.com/joelonsoftware]
[http://feeds.feedburner.com/codinghorror]
[http://martinfowler.com/feed.atom]
