|
HtmlLinksExtractor |
|
1 /* 2 * HtmlLinkExtractor.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 16/Nov/2001 12 * 13 * $Id: HtmlLinksExtractor.java,v 1.3 2001/12/04 15:35:15 hamish Exp $ 14 */ 15 16 package gate.util; 17 18 import javax.swing.text.html.*; 19 import javax.swing.text.html.parser.*; 20 import javax.swing.text.html.HTMLEditorKit.*; 21 import javax.swing.text.*; 22 import java.util.*; 23 import java.io.*; 24 25 /** 26 * This class extracts links from HTML files. 27 * <B>It has been hacked</B> to build the contents of 28 * <A HREF="http://gate.ac.uk/sitemap.html">http://gate.ac.uk/sitemap.html</A>; 29 * you <B>probably don't want to use it</B> for anything else! 30 * <P> 31 * Implements the behaviour of the HTML reader. 32 * Methods of an object of this class are called by the HTML parser when 33 * events will appear. 34 */ 35 public class HtmlLinksExtractor extends ParserCallback { 36 37 /** Debug flag */ 38 private static final boolean DEBUG = false; 39 40 /** The tag currently being processed */ 41 private HTML.Tag currentTag = null; 42 43 /** whether we've done a title before */ 44 static boolean firstTitle = true; 45 46 /** will contain </UL> after first title */ 47 static String endUl = ""; 48 49 /** Name of the file we're currently processing */ 50 static String currFile = ""; 51 52 /** Path to the file we're currently processing */ 53 static String currPath = ""; 54 55 /** This method is called when the HTML parser encounts the beginning 56 * of a tag that means that the tag is paired by an end tag and it's 57 * not an empty one. 58 */ 59 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { 60 61 currentTag = t; 62 if (HTML.Tag.A == t){ 63 Out.pr("<LI><" + t); 64 String href = ""; 65 Enumeration e = a.getAttributeNames(); 66 while(e.hasMoreElements()) { 67 HTML.Attribute name = (HTML.Attribute) e.nextElement(); 68 String value = (String) a.getAttribute(name); 69 70 if(name == HTML.Attribute.HREF) { 71 if( 72 value.startsWith("http:") || value.startsWith("HTTP:") || 73 value.startsWith("file:") || value.startsWith("FILE:") || 74 value.startsWith("mailto:") || value.startsWith("MAILTO:") || 75 value.startsWith("ftp:") || value.startsWith("FTP:") 76 ) 77 Out.pr(" HREF=\"" + value + "\""); 78 else { // if it is a relative path.... 79 Out.pr(" HREF=\"" + currPath + "/" + value + "\""); 80 } 81 } 82 } // while 83 84 Out.pr(">"); 85 }// End if 86 87 if (HTML.Tag.TITLE == t){ 88 Out.pr(endUl + "<H3>"); 89 if(firstTitle) { firstTitle = false; endUl = "</UL>"; } 90 }// End if 91 92 }//handleStartTag 93 94 private void printAttributes(MutableAttributeSet a){ 95 if (a == null) return; 96 // Take all the attributes an put them into the feature map 97 if (0 != a.getAttributeCount()){ 98 Enumeration enum = a.getAttributeNames(); 99 while (enum.hasMoreElements()){ 100 Object attribute = enum.nextElement(); 101 Out.pr(" "+ attribute.toString() + "=\"" + 102 a.getAttribute(attribute).toString()+"\""); 103 }// End while 104 }// End if 105 }// printAttributes(); 106 107 /** This method is called when the HTML parser encounts the end of a tag 108 * that means that the tag is paired by a beginning tag 109 */ 110 public void handleEndTag(HTML.Tag t, int pos){ 111 currentTag = null; 112 113 if (HTML.Tag.A == t) 114 Out.pr("</"+t+">\n"); 115 if (HTML.Tag.TITLE == t) 116 Out.pr( 117 "</H3></A>\n\n<P>Links in: <A HREF=\"" + currFile + 118 "\">" + currFile + "</A>:\n<UL>\n" 119 ); 120 121 }//handleEndTag 122 123 /** This method is called when the HTML parser encounts an empty tag 124 */ 125 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){ 126 if (HTML.Tag.A == t){ 127 Out.pr("<"+t); 128 printAttributes(a); 129 Out.pr("/>\n"); 130 }// End if 131 132 if (HTML.Tag.TITLE == t){ 133 Out.pr("<"+t); 134 printAttributes(a); 135 Out.pr("/>\n"); 136 }// End if 137 } // handleSimpleTag 138 139 /** This method is called when the HTML parser encounts text (PCDATA)*/ 140 public void handleText(char[] text, int pos){ 141 142 if(HTML.Tag.A == currentTag){ 143 //text of tag A 144 String tagText = new String(text); 145 Out.pr(tagText); 146 }// End if 147 148 if(HTML.Tag.TITLE == currentTag){ 149 //text of tag A 150 String tagText = new String(text); 151 Out.pr(tagText); 152 }// End if 153 154 }// end handleText(); 155 156 /** 157 * This method is called when the HTML parser encounts an error 158 * it depends on the programmer if he wants to deal with that error 159 */ 160 public void handleError(String errorMsg, int pos) { 161 //Out.println ("ERROR CALLED : " + errorMsg); 162 } 163 164 /** This method is called once, when the HTML parser reaches the end 165 * of its input streamin order to notify the parserCallback that there 166 * is nothing more to parse. 167 */ 168 public void flush() throws BadLocationException{ 169 }// flush 170 171 /** This method is called when the HTML parser encounts a comment 172 */ 173 public void handleComment(char[] text, int pos) { 174 } 175 176 /** 177 * Given a certain folder it lists recursively all the files contained 178 * in that folder. It returns a list of strings representing the file 179 * names 180 */ 181 private static List listAllFiles(File aFile, Set foldersToIgnore){ 182 java.util.List sgmlFileNames = new ArrayList(); 183 java.util.List foldersToExplore = new ArrayList(); 184 if (!aFile.isDirectory()){ 185 // add the file to the file list 186 sgmlFileNames.add(aFile.getPath()); 187 return sgmlFileNames; 188 }// End if 189 listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore); 190 return sgmlFileNames; 191 } // listAllFiles(); 192 193 /** Helper method for listAllFiles */ 194 private static void listFilesRec(File aFile, 195 java.util.List fileNames, 196 java.util.List foldersToExplore, 197 Set foldersToIgnore){ 198 199 String[] fileList = aFile.list(); 200 for (int i=0; i< fileList.length; i++){ 201 File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]); 202 if (tmpFile.isDirectory()){ 203 // If the file is not included 204 if (!foldersToIgnore.contains(tmpFile.getName())) { //fileList[i])) { 205 if(DEBUG) { 206 Err.prln("adding dir: " + tmpFile); 207 Err.prln(" name: " + tmpFile.getName()); 208 } 209 foldersToExplore.add(tmpFile); 210 } 211 }else{ 212 // only process .html files 213 if( 214 ( fileList[i].toLowerCase().endsWith(".html") ) || 215 ( fileList[i].toLowerCase().endsWith(".htm") ) 216 ) fileNames.add(tmpFile.getPath()); 217 }// End if 218 }// End for 219 220 while(!foldersToExplore.isEmpty()){ 221 File folder = (File)foldersToExplore.get(0); 222 foldersToExplore.remove(0); 223 listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore); 224 }//End while 225 226 } // listFilesRec(); 227 228 /** Extract links from all .html files below a directory */ 229 public static void main(String[] args){ 230 HTMLEditorKit.Parser parser = new ParserDelegator(); 231 // create a new Htmldocument handler 232 HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor(); 233 234 if (args.length == 0){ 235 Out.prln( 236 "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt" 237 ); 238 return; 239 } 240 // Create a folder file File 241 File htmlFolder = new File(args[0]); 242 Set foldersToIgnore = new HashSet(); 243 for(int i = 1; i<args.length; i++) 244 foldersToIgnore.add(args[i]); 245 246 List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore); 247 //Collections.sort(htmlFileNames); 248 while (!htmlFileNames.isEmpty()){ 249 try{ 250 String htmlFileName = (String) htmlFileNames.get(0); 251 currFile = htmlFileName; 252 currPath = new File(currFile).getParent().toString(); 253 htmlFileNames.remove(0); 254 255 Out.prln("\n\n<A HREF=\"file://" + htmlFileName + "\">"); 256 Reader reader = new FileReader(htmlFileName); 257 // parse the HTML document 258 parser.parse(reader, htmlDocHandler, true); 259 } catch (IOException e){ 260 e.printStackTrace(System.out); 261 }// End try 262 }// End while 263 System.err.println("done."); 264 }// main 265 266 }//End class HtmlLinksExtractor 267 268 269 270
|
HtmlLinksExtractor |
|