| 1 |
|
package yarfraw.utils; |
| 2 |
|
|
| 3 |
|
import java.io.IOException; |
| 4 |
|
import java.io.InputStream; |
| 5 |
|
|
| 6 |
|
import javax.xml.parsers.ParserConfigurationException; |
| 7 |
|
import javax.xml.parsers.SAXParser; |
| 8 |
|
import javax.xml.parsers.SAXParserFactory; |
| 9 |
|
|
| 10 |
|
import org.apache.commons.lang.StringUtils; |
| 11 |
|
import org.apache.commons.logging.Log; |
| 12 |
|
import org.apache.commons.logging.LogFactory; |
| 13 |
|
import org.xml.sax.Attributes; |
| 14 |
|
import org.xml.sax.SAXException; |
| 15 |
|
import org.xml.sax.ext.DefaultHandler2; |
| 16 |
|
|
| 17 |
|
import yarfraw.core.datamodel.FeedFormat; |
| 18 |
|
import yarfraw.core.datamodel.YarfrawException; |
| 19 |
|
|
| 20 |
|
|
| 21 |
|
|
| 22 |
|
|
| 23 |
|
|
| 24 |
|
|
| 25 |
|
|
| 26 |
|
|
| 27 |
|
|
| 28 |
12 |
public class FeedFormatDetector{ |
| 29 |
6 |
private static final Log LOG = LogFactory.getLog(FeedFormatDetector.class); |
| 30 |
|
private static final String RSS = "rss"; |
| 31 |
|
private static final String VERSION = "version"; |
| 32 |
|
private static final String VERSION_20 = "2.0"; |
| 33 |
|
private static final String RDF = "RDF"; |
| 34 |
|
private static final String RDF_NS_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; |
| 35 |
|
private static final String ATOM10_XMLNS = "http://www.w3.org/2005/Atom"; |
| 36 |
|
private static final String ATOM03_XMLNS = "http://purl.org/atom/ns#"; |
| 37 |
|
private static final String FEED = "feed"; |
| 38 |
|
|
| 39 |
6 |
private static final FormatDetectionHandler FormatDetectionHandler_NON_STRICT = new FormatDetectionHandler(false); |
| 40 |
6 |
private static final FormatDetectionHandler FormatDetectionHandler_STRICT = new FormatDetectionHandler(true); |
| 41 |
|
|
| 42 |
|
|
| 43 |
|
|
| 44 |
|
|
| 45 |
|
|
| 46 |
|
|
| 47 |
|
|
| 48 |
|
|
| 49 |
|
|
| 50 |
|
|
| 51 |
|
|
| 52 |
|
|
| 53 |
|
|
| 54 |
|
|
| 55 |
|
public static FeedFormat getFormat(InputStream stream) throws YarfrawException{ |
| 56 |
180 |
return getFormat(stream, false); |
| 57 |
|
} |
| 58 |
|
|
| 59 |
|
|
| 60 |
|
|
| 61 |
|
|
| 62 |
|
|
| 63 |
|
|
| 64 |
|
|
| 65 |
|
|
| 66 |
|
|
| 67 |
|
|
| 68 |
|
|
| 69 |
|
|
| 70 |
|
|
| 71 |
|
|
| 72 |
|
|
| 73 |
|
|
| 74 |
|
|
| 75 |
|
public static FeedFormat getFormat(InputStream stream, boolean strictFormatDetection) throws YarfrawException{ |
| 76 |
189 |
if(stream == null){ |
| 77 |
0 |
throw new IllegalArgumentException("Null stream received"); |
| 78 |
|
} |
| 79 |
189 |
SAXParserFactory factory = SAXParserFactory.newInstance(); |
| 80 |
189 |
factory.setNamespaceAware(true); |
| 81 |
|
try { |
| 82 |
189 |
SAXParser parser = factory.newSAXParser(); |
| 83 |
189 |
parser.parse(stream, strictFormatDetection? FormatDetectionHandler_STRICT : FormatDetectionHandler_NON_STRICT); |
| 84 |
|
|
| 85 |
0 |
return FeedFormat.UNKNOWN; |
| 86 |
|
} |
| 87 |
0 |
catch (ParserConfigurationException e) { |
| 88 |
0 |
throw new YarfrawException("Format Detection Failed", e); |
| 89 |
|
} |
| 90 |
189 |
catch(EarlyTerminationException e){ |
| 91 |
189 |
return e.getFormat(); |
| 92 |
|
} |
| 93 |
0 |
catch (SAXException e) { |
| 94 |
0 |
throw new YarfrawException("Format Detection Failed", e); |
| 95 |
|
} |
| 96 |
0 |
catch (IOException e) { |
| 97 |
0 |
throw new YarfrawException("Format Detection Failed", e); |
| 98 |
|
} |
| 99 |
|
} |
| 100 |
|
|
| 101 |
|
|
| 102 |
|
private static class FormatDetectionHandler extends DefaultHandler2{ |
| 103 |
12 |
private boolean _strict = false; |
| 104 |
12 |
public FormatDetectionHandler(boolean strict){ |
| 105 |
12 |
_strict = strict; |
| 106 |
12 |
} |
| 107 |
|
public void startElement(String uri, String localName, |
| 108 |
|
String qName, Attributes attributes) throws EarlyTerminationException{ |
| 109 |
|
|
| 110 |
|
|
| 111 |
189 |
if(RSS.equals(localName)){ |
| 112 |
159 |
String version = attributes.getValue(StringUtils.EMPTY, VERSION); |
| 113 |
159 |
if(!VERSION_20.equals(version) ){ |
| 114 |
18 |
if(!_strict){ |
| 115 |
12 |
LOG.warn("Input RSS feed is of version "+version+", reading it as version 2.0. Version 2.0 should be backward compatibile"); |
| 116 |
|
}else{ |
| 117 |
|
|
| 118 |
6 |
throw new EarlyTerminationException(FeedFormat.UNKNOWN); |
| 119 |
|
} |
| 120 |
|
} |
| 121 |
153 |
throw new EarlyTerminationException(FeedFormat.RSS20); |
| 122 |
30 |
}else if (RDF.equals(localName) && RDF_NS_URI.equals(uri)) { |
| 123 |
15 |
throw new EarlyTerminationException(FeedFormat.RSS10); |
| 124 |
15 |
}else if (FEED.equals(localName)&& ATOM10_XMLNS.equals(uri)) { |
| 125 |
12 |
throw new EarlyTerminationException(FeedFormat.ATOM10); |
| 126 |
3 |
}else if(FEED.equals(localName) && ATOM03_XMLNS.equals(uri)){ |
| 127 |
3 |
throw new EarlyTerminationException(FeedFormat.ATOM03); |
| 128 |
|
}else{ |
| 129 |
|
|
| 130 |
0 |
throw new EarlyTerminationException(FeedFormat.UNKNOWN); |
| 131 |
|
} |
| 132 |
|
} |
| 133 |
|
} |
| 134 |
|
|
| 135 |
|
|
| 136 |
|
|
| 137 |
|
|
| 138 |
|
|
| 139 |
|
|
| 140 |
0 |
private static class EarlyTerminationException extends SAXException{ |
| 141 |
|
private static final long serialVersionUID = 1L; |
| 142 |
|
private FeedFormat _format; |
| 143 |
189 |
public EarlyTerminationException(FeedFormat format){ |
| 144 |
189 |
_format = format; |
| 145 |
189 |
} |
| 146 |
|
public FeedFormat getFormat(){ |
| 147 |
189 |
return _format; |
| 148 |
|
} |
| 149 |
|
} |
| 150 |
|
} |