|
HtmlLinksExtractor |
|
1 /* 2 * HtmlLinkExtractor.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 16/Nov/2001 12 * 13 * $Id: HtmlLinksExtractor.java,v 1.2 2001/11/20 20:07:57 cursu Exp $ 14 */ 15 16 package gate.util; 17 18 import javax.swing.text.html.*; 19 import javax.swing.text.html.parser.*; 20 import javax.swing.text.html.HTMLEditorKit.*; 21 import javax.swing.text.*; 22 import java.util.*; 23 import java.io.*; 24 25 /** 26 * This class extracts links from HTML files. 27 * Implements the behaviour of the HTML reader. 28 * Methods of an object of this class are called by the HTML parser when 29 * events will appear. 30 */ 31 public class HtmlLinksExtractor extends ParserCallback { 32 33 /** Debug flag */ 34 private static final boolean DEBUG = false; 35 36 /** The tag currently being processed */ 37 private HTML.Tag currentTag = null; 38 39 /** This method is called when the HTML parser encounts the beginning 40 * of a tag that means that the tag is paired by an end tag and it's 41 * not an empty one. 42 */ 43 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { 44 45 currentTag = t; 46 if (HTML.Tag.A == t){ 47 Out.pr("<" + t); 48 printAttributes(a); 49 Out.pr(">"); 50 }// End if 51 52 if (HTML.Tag.H1 == t){ 53 Out.pr("<" + t); 54 printAttributes(a); 55 Out.pr(">"); 56 }// End if 57 58 }//handleStartTag 59 60 private void printAttributes(MutableAttributeSet a){ 61 if (a == null) return; 62 // Take all the attributes an put them into the feature map 63 if (0 != a.getAttributeCount()){ 64 Enumeration enum = a.getAttributeNames(); 65 while (enum.hasMoreElements()){ 66 Object attribute = enum.nextElement(); 67 Out.pr(" "+ attribute.toString() + "=\"" + 68 a.getAttribute(attribute).toString()+"\""); 69 }// End while 70 }// End if 71 }// printAttributes(); 72 73 /** This method is called when the HTML parser encounts the end of a tag 74 * that means that the tag is paired by a beginning tag 75 */ 76 public void handleEndTag(HTML.Tag t, int pos){ 77 currentTag = null; 78 79 if (HTML.Tag.A == t) 80 Out.pr("</"+t+">\n"); 81 if (HTML.Tag.H1 == t) 82 Out.pr("</"+t+">\n"); 83 84 }//handleEndTag 85 86 /** This method is called when the HTML parser encounts an empty tag 87 */ 88 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){ 89 if (HTML.Tag.A == t){ 90 Out.pr("<"+t); 91 printAttributes(a); 92 Out.pr("/>\n"); 93 }// End if 94 95 if (HTML.Tag.H1 == t){ 96 Out.pr("<"+t); 97 printAttributes(a); 98 Out.pr("/>\n"); 99 }// End if 100 } // handleSimpleTag 101 102 /** This method is called when the HTML parser encounts text (PCDATA)*/ 103 public void handleText(char[] text, int pos){ 104 105 if(HTML.Tag.A == currentTag){ 106 //text of tag A 107 String tagText = new String(text); 108 Out.pr(tagText); 109 }// End if 110 111 if(HTML.Tag.H1 == currentTag){ 112 //text of tag A 113 String tagText = new String(text); 114 Out.pr(tagText); 115 }// End if 116 117 }// end handleText(); 118 119 /** 120 * This method is called when the HTML parser encounts an error 121 * it depends on the programmer if he wants to deal with that error 122 */ 123 public void handleError(String errorMsg, int pos) { 124 //Out.println ("ERROR CALLED : " + errorMsg); 125 } 126 127 /** This method is called once, when the HTML parser reaches the end 128 * of its input streamin order to notify the parserCallback that there 129 * is nothing more to parse. 130 */ 131 public void flush() throws BadLocationException{ 132 }// flush 133 134 /** This method is called when the HTML parser encounts a comment 135 */ 136 public void handleComment(char[] text, int pos) { 137 } 138 139 /** 140 * Given a certain folder it lists recursively all the files contained 141 * in that folder. It returns a list of strings representing the file 142 * names 143 */ 144 private static List listAllFiles(File aFile, Set foldersToIgnore){ 145 java.util.List sgmlFileNames = new ArrayList(); 146 java.util.List foldersToExplore = new ArrayList(); 147 if (!aFile.isDirectory()){ 148 // add the file to the file list 149 sgmlFileNames.add(aFile.getPath()); 150 return sgmlFileNames; 151 }// End if 152 listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore); 153 return sgmlFileNames; 154 } // listAllFiles(); 155 156 /** Helper method for listAllFiles */ 157 private static void listFilesRec(File aFile, 158 java.util.List fileNames, 159 java.util.List foldersToExplore, 160 Set foldersToIgnore){ 161 162 String[] fileList = aFile.list(); 163 for (int i=0; i< fileList.length; i++){ 164 File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]); 165 if (tmpFile.isDirectory()){ 166 // If the file is not included 167 if (!foldersToIgnore.contains(fileList[i])) 168 foldersToExplore.add(tmpFile); 169 }else{ 170 // only process .html files 171 if( 172 ( fileList[i].toLowerCase().endsWith(".html") ) || 173 ( fileList[i].toLowerCase().endsWith(".htm") ) 174 ) fileNames.add(tmpFile.getPath()); 175 }// End if 176 }// End for 177 178 while(!foldersToExplore.isEmpty()){ 179 File folder = (File)foldersToExplore.get(0); 180 foldersToExplore.remove(0); 181 listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore); 182 }//End while 183 184 } // listFilesRec(); 185 186 /** Extract links from all .html files below a directory */ 187 public static void main(String[] args){ 188 HTMLEditorKit.Parser parser = new ParserDelegator(); 189 // create a new Htmldocument handler 190 HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor(); 191 192 if (args.length == 0){ 193 Out.prln( 194 "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt" 195 ); 196 return; 197 } 198 // Create a folder file File 199 File htmlFolder = new File(args[0]); 200 Set foldersToIgnore = new HashSet(); 201 for(int i = 1; i<args.length; i++) 202 foldersToIgnore.add(args[i]); 203 204 List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore); 205 while (!htmlFileNames.isEmpty()){ 206 try{ 207 String htmlFileName = (String) htmlFileNames.get(0); 208 htmlFileNames.remove(0); 209 210 Reader reader = new FileReader(htmlFileName); 211 // parse the HTML document 212 parser.parse(reader, htmlDocHandler, true); 213 } catch (IOException e){ 214 e.printStackTrace(System.out); 215 }// End try 216 }// End while 217 System.err.println("done."); 218 }// main 219 220 }//End class HtmlLinksExtractor 221 222 223 224
|
HtmlLinksExtractor |
|