Coverage Report - yarfraw.utils.FeedFormatDetector
 
Classes in this File Line Coverage Branch Coverage Complexity
FeedFormatDetector
77% 
100% 
4.833
 
 1  
 package yarfraw.utils;
 2  
 
 3  
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;

 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.ext.DefaultHandler2;

 import yarfraw.core.datamodel.FeedFormat;
 import yarfraw.core.datamodel.YarfrawException;
 19  
 
 20  
 /**
 21  
  * A (somewhat) primitive RSS feed format detection utility class.<br/>
 22  
  * It checks the root element of the input xml stream to determines the format
 23  
  * of a feed.
 24  
  *   
 25  
  * @author jliang
 26  
  *
 27  
  */
 28  12
 public class FeedFormatDetector{
 29  6
   private static final Log LOG = LogFactory.getLog(FeedFormatDetector.class);
 30  
   private static final String RSS = "rss";
 31  
   private static final String VERSION = "version";
 32  
   private static final String VERSION_20 = "2.0";
 33  
   private static final String RDF = "RDF";
 34  
   private static final String RDF_NS_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
 35  
   private static final String ATOM10_XMLNS = "http://www.w3.org/2005/Atom";
 36  
   private static final String ATOM03_XMLNS = "http://purl.org/atom/ns#";
 37  
   private static final String FEED = "feed";
 38  
   
 39  6
   private static final FormatDetectionHandler FormatDetectionHandler_NON_STRICT = new FormatDetectionHandler(false);
 40  6
   private static final FormatDetectionHandler FormatDetectionHandler_STRICT = new FormatDetectionHandler(true);
 41  
   
 42  
   /**
 43  
    * Determines the format of the input feed stream. 
 44  
    * <br/>
 45  
    * Officially, Yarfraw currently only supports {@link FeedFormat these} formats. But, the format detector
 46  
    *  will report RSS 0.9x formats as RSS 2.0 because the FeedReader is able to read them using the RSS 2.0 parser.
 47  
    *  If you want a stricter format detector, you can use <code>getFormat(InputStream stream, boolean strictFormatDetection)</code>
 48  
    *  to pass in a strict enforcement flag to tell the detector you want strict format detection.
 49  
    *  
 50  
    * @param stream input stream of a feed
 51  
    * @return the format of the feed
 52  
    * @throws YarfrawException if unable to detect the format, this usually means the detector 
 53  
    * failed to parse the input stream.
 54  
    */
 55  
   public static FeedFormat getFormat(InputStream stream) throws YarfrawException{
 56  180
     return getFormat(stream, false); //non-strict format detection
 57  
   }
 58  
 
 59  
   /**
 60  
    * Determines the format of the input feed stream. 
 61  
    * <br/>
 62  
    * Officially, Yarfraw currently only supports {@link FeedFormat these} formats. But, the format detector
 63  
    *  will report RSS 0.9x formats as RSS 2.0 because the FeedReader is able to read them using the RSS 2.0 parser.
 64  
    *  If you want a stricter format detector, you can use <code>getFormat(InputStream stream, boolean strictFormatDetection)</code>
 65  
    *  to pass in a strict enforcement flag to tell the detector you want strict format detection.
 66  
    *  
 67  
    * 
 68  
    * @param strictFormatDetection whether to use 'strict' format detection. if set to true, the method will only report
 69  
    * RSS 2.0 when the root element is 'rss' and it has a version 2.0 attribute: &lt;rss version="2.0">
 70  
    * @param stream input stream of a feed
 71  
    * @return the format of the feed
 72  
    * @throws YarfrawException if unable to detect the format, this usually means the detector 
 73  
    * failed to parse the input stream.
 74  
    */
 75  
   public static FeedFormat getFormat(InputStream stream, boolean strictFormatDetection) throws YarfrawException{
 76  189
     if(stream == null){
 77  0
       throw new IllegalArgumentException("Null stream received");
 78  
     }
 79  189
     SAXParserFactory factory = SAXParserFactory.newInstance();
 80  189
     factory.setNamespaceAware(true);
 81  
     try {
 82  189
       SAXParser parser = factory.newSAXParser();
 83  189
       parser.parse(stream, strictFormatDetection? FormatDetectionHandler_STRICT : FormatDetectionHandler_NON_STRICT);
 84  
       //we should never get to here
 85  0
       return FeedFormat.UNKNOWN;
 86  
     }
 87  0
     catch (ParserConfigurationException e) {
 88  0
       throw new YarfrawException("Format Detection Failed", e);
 89  
     }
 90  189
     catch(EarlyTerminationException e){
 91  189
       return e.getFormat(); //should always get to here
 92  
     }
 93  0
     catch (SAXException e) {
 94  0
       throw new YarfrawException("Format Detection Failed", e);
 95  
     }
 96  0
     catch (IOException e) {
 97  0
       throw new YarfrawException("Format Detection Failed", e);
 98  
     }
 99  
   }
 100  
 
 101  
   
 102  
   private static class FormatDetectionHandler extends DefaultHandler2{
 103  12
     private boolean _strict = false;
 104  12
     public FormatDetectionHandler(boolean strict){
 105  12
       _strict = strict;
 106  12
     }
 107  
     public void startElement(String uri, String localName,
 108  
             String qName, Attributes attributes) throws EarlyTerminationException{
 109  
 
 110  
     //just check the root element is enough 
 111  189
       if(RSS.equals(localName)){
 112  159
         String version = attributes.getValue(StringUtils.EMPTY, VERSION);
 113  159
         if(!VERSION_20.equals(version) ){
 114  18
           if(!_strict){
 115  12
             LOG.warn("Input RSS feed is of version "+version+", reading it as version 2.0. Version 2.0 should be backward compatibile");
 116  
           }else{
 117  
             //strict format detection, version number has to match
 118  6
             throw new EarlyTerminationException(FeedFormat.UNKNOWN);
 119  
           }
 120  
         }
 121  153
         throw new EarlyTerminationException(FeedFormat.RSS20);
 122  30
       }else if (RDF.equals(localName) && RDF_NS_URI.equals(uri)) {
 123  15
         throw new EarlyTerminationException(FeedFormat.RSS10);
 124  15
       }else if (FEED.equals(localName)&& ATOM10_XMLNS.equals(uri)) {
 125  12
         throw new EarlyTerminationException(FeedFormat.ATOM10);
 126  3
       }else if(FEED.equals(localName) && ATOM03_XMLNS.equals(uri)){
 127  3
         throw new EarlyTerminationException(FeedFormat.ATOM03);
 128  
       }else{
 129  
        //does not recognize the format from the root element, the format must be unknown
 130  0
         throw new EarlyTerminationException(FeedFormat.UNKNOWN);
 131  
       }
 132  
     }
 133  
   }
 134  
   
 135  
   
 136  
   
 137  
   /**
 138  
    * An exception to be thrown for letting us to terminate the parsing prematurely
 139  
    */
 140  0
   private static class EarlyTerminationException extends SAXException{
 141  
     private static final long serialVersionUID = 1L;
 142  
     private FeedFormat _format;
 143  189
     public EarlyTerminationException(FeedFormat format){
 144  189
       _format = format;
 145  189
     }
 146  
     public FeedFormat getFormat(){
 147  189
       return _format;
 148  
     }
 149  
   }
 150  
 }