1   /*
2    *  HtmlLinksExtractor.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  16/Nov/2001
12   *
13   *  $Id: HtmlLinksExtractor.java,v 1.2 2001/11/20 20:07:57 cursu Exp $
14   */
15  
16  package gate.util;
17  
18  import javax.swing.text.html.*;
19  import javax.swing.text.html.parser.*;
20  import javax.swing.text.html.HTMLEditorKit.*;
21  import javax.swing.text.*;
22  import java.util.*;
23  import java.io.*;
24  
25  /**
26   * This class extracts links from HTML files.
27   * Implements the behaviour of the HTML reader.
28   * Methods of an object of this class are called by the HTML parser when
29   * events will appear.
30   */
31  public class HtmlLinksExtractor extends ParserCallback {
32  
33    /** Debug flag */
34    private static final boolean DEBUG = false;
35  
36    /** The tag currently being processed */
37    private HTML.Tag currentTag = null;
38  
39    /** This method is called when the HTML parser encounts the beginning
40      * of a tag that means that the tag is paired by an end tag and it's
41      * not an empty one.
42      */
43    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
44  
45      currentTag = t;
46      if (HTML.Tag.A == t){
47        Out.pr("<" + t);
48        printAttributes(a);
49        Out.pr(">");
50      }// End if
51  
52      if (HTML.Tag.H1 == t){
53        Out.pr("<" + t);
54        printAttributes(a);
55        Out.pr(">");
56      }// End if
57  
58    }//handleStartTag
59  
60    private void printAttributes(MutableAttributeSet a){
61      if (a == null) return;
62      // Take all the attributes an put them into the feature map
63      if (0 != a.getAttributeCount()){
64        Enumeration enum = a.getAttributeNames();
65        while (enum.hasMoreElements()){
66          Object attribute = enum.nextElement();
67          Out.pr(" "+ attribute.toString() + "=\"" +
68                                    a.getAttribute(attribute).toString()+"\"");
69        }// End while
70      }// End if
71    }// printAttributes();
72  
73     /** This method is called when the HTML parser encounts the end of a tag
74       * that means that the tag is paired by a beginning tag
75       */
76    public void handleEndTag(HTML.Tag t, int pos){
77      currentTag = null;
78  
79      if (HTML.Tag.A == t)
80        Out.pr("</"+t+">\n");
81      if (HTML.Tag.H1 == t)
82        Out.pr("</"+t+">\n");
83  
84    }//handleEndTag
85  
86    /** This method is called when the HTML parser encounts an empty tag
87      */
88    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
89      if (HTML.Tag.A == t){
90        Out.pr("<"+t);
91        printAttributes(a);
92        Out.pr("/>\n");
93      }// End if
94  
95      if (HTML.Tag.H1 == t){
96        Out.pr("<"+t);
97        printAttributes(a);
98        Out.pr("/>\n");
99      }// End if
100   } // handleSimpleTag
101 
102   /** This method is called when the HTML parser encounts text (PCDATA)*/
103   public void handleText(char[] text, int pos){
104 
105     if(HTML.Tag.A == currentTag){
106       //text of tag A
107       String tagText = new String(text);
108       Out.pr(tagText);
109     }// End if
110 
111     if(HTML.Tag.H1 == currentTag){
112       //text of tag A
113       String tagText = new String(text);
114       Out.pr(tagText);
115     }// End if
116 
117   }// end handleText();
118 
119   /**
120     * This method is called when the HTML parser encounts an error
121     * it depends on the programmer if he wants to deal with that error
122     */
123   public void handleError(String errorMsg, int pos) {
124     //Out.println ("ERROR CALLED : " + errorMsg);
125   }
126 
127   /** This method is called once, when the HTML parser reaches the end
128     * of its input streamin order to notify the parserCallback that there
129     * is nothing more to parse.
130     */
131   public void flush() throws BadLocationException{
132   }// flush
133 
134   /** This method is called when the HTML parser encounts a comment
135     */
136   public void handleComment(char[] text, int pos) {
137   }
138 
139   /**
140    * Given a certain folder it lists recursively all the files contained
141    * in that folder. It returns a list of strings representing the file
142    * names
143    */
144   private static List listAllFiles(File aFile, Set foldersToIgnore){
145     java.util.List sgmlFileNames = new ArrayList();
146     java.util.List foldersToExplore = new ArrayList();
147     if (!aFile.isDirectory()){
148       // add the file to the file list
149       sgmlFileNames.add(aFile.getPath());
150       return sgmlFileNames;
151     }// End if
152     listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore);
153     return sgmlFileNames;
154   } // listAllFiles();
155 
156   /** Helper method for listAllFiles */
157   private static void listFilesRec(File aFile,
158                                   java.util.List fileNames,
159                                   java.util.List foldersToExplore,
160                                   Set foldersToIgnore){
161 
162     String[] fileList = aFile.list();
163     for (int i=0; i< fileList.length; i++){
164       File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]);
165       if (tmpFile.isDirectory()){
166         // If the file is not included
167         if (!foldersToIgnore.contains(fileList[i]))
168           foldersToExplore.add(tmpFile);
169       }else{
170         // only process .html files
171         if(
172           ( fileList[i].toLowerCase().endsWith(".html") ) ||
173           ( fileList[i].toLowerCase().endsWith(".htm") )
174         ) fileNames.add(tmpFile.getPath());
175       }// End if
176     }// End for
177 
178     while(!foldersToExplore.isEmpty()){
179       File folder = (File)foldersToExplore.get(0);
180       foldersToExplore.remove(0);
181       listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore);
182     }//End while
183 
184   } // listFilesRec();
185 
186   /** Extract links from all .html files below a directory */
187   public static void main(String[] args){
188     HTMLEditorKit.Parser  parser = new ParserDelegator();
189     // create a new Htmldocument handler
190     HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor();
191 
192     if (args.length == 0){
193       Out.prln(
194         "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt"
195       );
196       return;
197     }
198     // Create a folder file File
199     File htmlFolder = new File(args[0]);
200     Set foldersToIgnore = new HashSet();
201     for(int i = 1; i<args.length; i++)
202       foldersToIgnore.add(args[i]);
203 
204     List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore);
205     while (!htmlFileNames.isEmpty()){
206       try{
207         String htmlFileName = (String) htmlFileNames.get(0);
208         htmlFileNames.remove(0);
209 
210         Reader reader = new FileReader(htmlFileName);
211         // parse the HTML document
212         parser.parse(reader, htmlDocHandler, true);
213       } catch (IOException e){
214         e.printStackTrace(System.out);
215       }// End try
216     }// End while
217     System.err.println("done.");
218   }// main
219 
220 }//End class HtmlLinksExtractor
221 
222 
223 
224