I had already settled on Encog as the framework for the AI / machine learning algorithms; for this exercise I also needed an RSS reader and an HTML parser. The two libraries I ended up using were:
ROME
JSoup
For other general utilities and collection manipulation I used:
Google Guava
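Just to show the first two in action before the full reader further down, here is a minimal sketch of ROME fetching a feed and JSoup stripping the HTML out of a description. The class name is made up for illustration and the feed URL is simply one from the list below; it assumes ROME 1.0's com.sun.syndication API.

package net.briandupreez.pci.data;

import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import org.jsoup.Jsoup;

import java.net.URL;

public class FeedSketch {
    public static void main(final String[] args) throws Exception {
        // Fetch and parse the feed with ROME.
        final SyndFeed feed = new SyndFeedInput().build(
                new XmlReader(new URL("http://feeds.feedburner.com/codinghorror")));
        for (final Object o : feed.getEntries()) {
            final SyndEntry entry = (SyndEntry) o;
            // Strip any HTML markup from the description with JSoup.
            final String text = entry.getDescription() != null
                    ? Jsoup.parse(entry.getDescription().getValue()).text()
                    : "";
            System.out.println(entry.getTitle() + ": " + text);
        }
    }
}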
I kept the list of blogs short, including some of the software bloggers I follow, just to keep testing quick. I had to alter the percentage thresholds a little from the implementation in Programming Collective Intelligence (PCI), as you can see in the stripOutlyingWords method below, but still got the desired result.
Blogs Used:
http://blog.guykawasaki.com/index.rdf
http://blog.outer-court.com/rss.xml
http://flagrantdisregard.com/index.php/feed/
http://gizmodo.com/index.xml
http://googleblog.blogspot.com/rss.xml
http://radar.oreilly.com/index.rdf
http://www.wired.com/rss/index.xml
http://feeds.feedburner.com/codinghorror
http://feeds.feedburner.com/joelonsoftware
http://martinfowler.com/feed.atom
http://www.briandupreez.net/feeds/posts/default
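These URLs go one per line into the short-feedlist.txt file on the classpath; the main class below picks it up with Guava's Resources and reads it line by line.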
For the implementation I just went with a main class and a reader class:
package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Collections2;
import com.sun.syndication.feed.synd.SyndCategoryImpl;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URL;
import java.util.*;

public class FeedReader {

    /**
     * Reads a feed and adds every unique word from its titles, categories,
     * descriptions and content to the given set.
     */
    @SuppressWarnings("unchecked")
    public static Set<String> determineAllUniqueWords(final String url, final Set<String> blogWordList) {
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));
            final List<SyndEntryImpl> entries = feed.getEntries();
            for (final SyndEntryImpl entry : entries) {
                blogWordList.addAll(cleanAndSplitString(entry.getTitle()));
                blogWordList.addAll(doCategories(entry));
                blogWordList.addAll(doDescription(entry));
                blogWordList.addAll(doContent(entry));
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: " + url + "\n" + ex.getMessage());
        }
        return blogWordList;
    }

    /** Pulls the words out of an entry's content blocks, stripping HTML where needed. */
    @SuppressWarnings("unchecked")
    private static List<String> doContent(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndContent> contents = entry.getContents();
        if (contents != null) {
            for (final SyndContent syndContent : contents) {
                // getType(), not getMode(): the MIME type is what says whether this is HTML.
                if ("text/html".equals(syndContent.getType())) {
                    blogWordList.addAll(stripHtmlAndAddText(syndContent));
                } else {
                    blogWordList.addAll(cleanAndSplitString(syndContent.getValue()));
                }
            }
        }
        return blogWordList;
    }

    /** Pulls the words out of an entry's description, stripping HTML where needed. */
    private static List<String> doDescription(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final SyndContent description = entry.getDescription();
        if (description != null) {
            if ("text/html".equals(description.getType())) {
                blogWordList.addAll(stripHtmlAndAddText(description));
            } else {
                blogWordList.addAll(cleanAndSplitString(description.getValue()));
            }
        }
        return blogWordList;
    }

    /** Collects the entry's category names, lower-cased. */
    @SuppressWarnings("unchecked")
    private static List<String> doCategories(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndCategoryImpl> categories = entry.getCategories();
        for (final SyndCategoryImpl category : categories) {
            blogWordList.add(category.getName().toLowerCase());
        }
        return blogWordList;
    }

    /** Strips the HTML markup with JSoup and returns the remaining words. */
    private static List<String> stripHtmlAndAddText(final SyndContent content) {
        final Document document = Jsoup.parse(content.getValue());
        // document.text() gives the combined text once; iterating over
        // getAllElements() would count the text of nested elements repeatedly.
        return cleanAndSplitString(document.text());
    }

    /** Lower-cases the input, drops punctuation and digits, and splits on whitespace. */
    private static List<String> cleanAndSplitString(final String input) {
        if (input != null) {
            final String[] words = input.toLowerCase()
                    .replaceAll("\\p{Punct}", "")
                    .replaceAll("\\p{Digit}", "")
                    .split("\\s+");
            return Arrays.asList(words);
        }
        return new ArrayList<>();
    }

    /**
     * Counts how often each of the given words occurs in the feed. A TreeMap
     * keeps the words sorted, so every blog's counts come back in the same order.
     */
    @SuppressWarnings("unchecked")
    public static Map<String, Double> countWords(final String url, final Set<String> blogWords) {
        final Map<String, Double> resultMap = new TreeMap<>();
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));
            final List<SyndEntryImpl> entries = feed.getEntries();
            final List<String> allBlogWords = new ArrayList<>();
            for (final SyndEntryImpl entry : entries) {
                allBlogWords.addAll(cleanAndSplitString(entry.getTitle()));
                allBlogWords.addAll(doCategories(entry));
                allBlogWords.addAll(doDescription(entry));
                allBlogWords.addAll(doContent(entry));
            }
            for (final String word : blogWords) {
                resultMap.put(word, (double) Collections2.filter(allBlogWords, Predicates.equalTo(word)).size());
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: " + url + "\n" + ex.getMessage());
        }
        return resultMap;
    }
}
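For what it's worth, the reader can also be used on its own along these lines (a hypothetical snippet, using one of the feed URLs from the list above):

final String feed = "http://feeds.feedburner.com/codinghorror";
// Gather the unique words in the feed, then count each word's occurrences.
final Set<String> words = FeedReader.determineAllUniqueWords(feed, new HashSet<String>());
final Map<String, Double> counts = FeedReader.countWords(feed, words);
System.out.println(counts.size() + " words counted for " + feed);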
Main:
package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Maps;
import com.google.common.io.Resources;
import com.google.common.primitives.Doubles;
import org.encog.ml.MLCluster;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.kmeans.KMeansClustering;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

public class FeedReaderMain {

    public static void main(final String[] args) {
        final FeedReaderMain feedReaderMain = new FeedReaderMain();
        try {
            feedReaderMain.run();
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }

    public void run() throws IOException {
        final String file = Resources.getResource("short-feedlist.txt").getFile();
        final Set<String> blogWords = determineWordCompleteList(file);
        final Map<String, Map<String, Double>> blogWordCount = countWordsPerBlog(file, blogWords);
        // Strip out the outlying words before clustering.
        stripOutlyingWords(blogWords, blogWordCount);
        performClusteringAndDisplay(blogWordCount);
    }

    private void performClusteringAndDisplay(final Map<String, Map<String, Double>> blogWordCount) {
        final BasicMLDataSet set = new BasicMLDataSet();
        // Keep each blog's input vector so the clusters can be mapped back to URLs.
        final Map<String, List<Double>> inputMap = new HashMap<>();
        for (final Map.Entry<String, Map<String, Double>> entry : blogWordCount.entrySet()) {
            final double[] elements = Doubles.toArray(entry.getValue().values());
            inputMap.put(entry.getKey(), Doubles.asList(elements));
            set.add(new BasicMLData(elements));
        }

        // Cluster the word-count vectors into 3 groups over 150 iterations.
        final KMeansClustering kmeans = new KMeansClustering(3, set);
        kmeans.iteration(150);

        // Display the clusters, looking up each vector's blog URL.
        int i = 1;
        for (final MLCluster cluster : kmeans.getClusters()) {
            System.out.println("*** Cluster " + (i++) + " ***");
            final MLDataSet ds = cluster.createDataSet();
            final MLDataPair pair = BasicMLDataPair.createPair(ds.getInputSize(), ds.getIdealSize());
            for (int j = 0; j < ds.getRecordCount(); j++) {
                ds.getRecord(j, pair);
                final List<Double> listInput = Doubles.asList(pair.getInputArray());
                System.out.println(Maps.filterValues(inputMap, Predicates.equalTo(listInput)).keySet().toString());
            }
        }
    }

    private Map<String, Map<String, Double>> countWordsPerBlog(final String file, final Set<String> blogWords) throws IOException {
        final Map<String, Map<String, Double>> blogWordCount = new HashMap<>();
        try (final BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                blogWordCount.put(line, FeedReader.countWords(line, blogWords));
            }
        }
        return blogWordCount;
    }

    private Set<String> determineWordCompleteList(final String file) throws IOException {
        Set<String> blogWords = new HashSet<>();
        try (final BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                blogWords = FeedReader.determineAllUniqueWords(line, blogWords);
                System.out.println("Size: " + blogWords.size());
            }
        }
        return blogWords;
    }

    private void stripOutlyingWords(final Set<String> blogWords, final Map<String, Map<String, Double>> blogWordCount) {
        final Iterator<String> wordIter = blogWords.iterator();
        final double listSize = blogWords.size();
        while (wordIter.hasNext()) {
            final String word = wordIter.next();
            double wordCount = 0;
            for (final Map<String, Double> values : blogWordCount.values()) {
                wordCount += values.get(word) != null ? values.get(word) : 0;
            }
            // Drop words that are too rare, too common, or too short to be useful.
            final double percentage = (wordCount / listSize) * 100;
            if (percentage < 0.1 || percentage > 20 || word.length() < 3) {
                wordIter.remove();
                for (final Map<String, Double> values : blogWordCount.values()) {
                    values.remove(word);
                }
            } else {
                System.out.println("\t keeping: " + word + " Percentage: " + percentage);
            }
        }
    }
}
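A couple of details in there are easy to miss: countWords returns a TreeMap, so every blog's counts come back in the same sorted word order and the double[] vectors handed to Encog line up word for word; and the inputMap exists only to map each vector back to its blog URL when printing, since KMeansClustering(3, set) works purely on the numbers (3 clusters, 150 iterations).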
The Results:
*** Cluster 1 ***
[http://www.briandupreez.net/feeds/posts/default]
*** Cluster 2 ***
[http://blog.guykawasaki.com/index.rdf]
[http://radar.oreilly.com/index.rdf]
[http://googleblog.blogspot.com/rss.xml]
[http://blog.outer-court.com/rss.xml]
[http://gizmodo.com/index.xml]
[http://flagrantdisregard.com/index.php/feed/]
[http://www.wired.com/rss/index.xml]
*** Cluster 3 ***
[http://feeds.feedburner.com/joelonsoftware]
[http://feeds.feedburner.com/codinghorror]
[http://martinfowler.com/feed.atom]