|
RepositioningInfo |
|
1 /* 2 * RepositioningInfo.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Angel Kirilov, 04/January/2002 12 * 13 * $Id: RepositioningInfo.java,v 1.8 2002/07/09 08:23:01 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.io.*; 19 import java.util.ArrayList; 20 21 import gate.util.*; 22 23 /** 24 * RepositioningInfo keep information about correspondence of positions 25 * between the original and extracted document content. With this information 26 * this class could be used for computing of this correspondence in the strict 27 * way (return -1 where is no correspondence) 28 * or in "flow" way (return near computable position) 29 */ 30 31 public class RepositioningInfo extends ArrayList { 32 33 /** Freeze the serialization UID. */ 34 static final long serialVersionUID = -2895662600168468559L; 35 /** Debug flag */ 36 private static final boolean DEBUG = false; 37 38 /** 39 * Just information keeper inner class. No significant functionality. 40 */ 41 public class PositionInfo implements Serializable { 42 43 /** Freeze the serialization UID. */ 44 static final long serialVersionUID = -7747351720249898499L; 45 46 /** Data members for one peace of text information */ 47 private long m_origPos, m_origLength, m_currPos, m_currLength; 48 49 /** The only constructor. We haven't set methods for data members. */ 50 public PositionInfo(long orig, long origLen, long curr, long currLen) { 51 m_origPos = orig; 52 m_origLength = origLen; 53 m_currPos = curr; 54 m_currLength = currLen; 55 } // PositionInfo 56 57 /** Position in the extracted (and probably changed) content */ 58 public long getCurrentPosition() { 59 return m_currPos; 60 } // getCurrentPosition 61 62 /** Position in the original content */ 63 public long getOriginalPosition() { 64 return m_origPos; 65 } // getOriginalPosition 66 67 /** Length of peace of text in the original content */ 68 public long getOriginalLength() { 69 return m_origLength; 70 } // getOriginalLength 71 72 /** Length of peace of text in the extracted content */ 73 public long getCurrentLength() { 74 return m_currLength; 75 } // getCurrentLength 76 77 /** For debug purposes */ 78 public String toString() { 79 return "("+m_origPos+","+m_origLength+"," 80 +m_currPos+","+m_currLength+")"; 81 } // toString 82 } // class PositionInfo 83 84 /** Default constructor */ 85 public RepositioningInfo() { 86 super(); 87 } // RepositioningInfo 88 89 /** Create a new position information record. */ 90 public void addPositionInfo(long origPos, long origLength, 91 long currPos, long currLength) { 92 // sorted add of new position 93 int insertPos = 0; 94 PositionInfo lastPI; 95 96 for(int i = size(); i>0; i--) { 97 lastPI = (PositionInfo) get(i-1); 98 if(lastPI.getOriginalPosition() < origPos) { 99 insertPos = i; 100 break; 101 } // if - sort key 102 } // for 103 104 add(insertPos, new PositionInfo(origPos, origLength, currPos, currLength)); 105 } // addPositionInfo 106 107 /** Compute position in extracted content by position in the original content. 108 * If there is no correspondence return -1. 109 */ 110 public long getExtractedPos(long absPos) { 111 long result = absPos; 112 PositionInfo currPI = null; 113 int size = size(); 114 115 if(size != 0) { 116 long origPos, origLen; 117 boolean found = false; 118 119 for(int i=0; i<size; ++i) { 120 currPI = (PositionInfo) get(i); 121 origPos = currPI.getOriginalPosition(); 122 origLen = currPI.getOriginalLength(); 123 124 if(absPos <= origPos+origLen) { 125 if(absPos < origPos) { 126 // outside the range of information 127 result = -1; 128 } 129 else { 130 // current position + offset in this PositionInfo record 131 result = currPI.getCurrentPosition() + absPos - origPos; 132 } // if 133 found = true; 134 break; 135 } // if 136 } // for 137 138 if(!found) { 139 // after the last repositioning info 140 result = -1; 141 } // if - !found 142 } // if 143 144 return result; 145 } // getExtractedPos 146 147 public long getOriginalPos(long relPos) { 148 return getOriginalPos(relPos, false); 149 } // getOriginalPos 150 151 /** Compute position in original content by position in the extracted content. 152 * If there is no correspondence return -1. 153 */ 154 public long getOriginalPos(long relPos, boolean afterChar) { 155 long result = relPos; 156 PositionInfo currPI = null; 157 int size = size(); 158 159 if(size != 0) { 160 long currPos, currLen; 161 boolean found = false; 162 163 for(int i=0; i<size; ++i) { 164 currPI = (PositionInfo) get(i); 165 currPos = currPI.getCurrentPosition(); 166 currLen = currPI.getCurrentLength(); 167 168 if(afterChar && relPos == currPos+currLen) { 169 result = currPI.getOriginalPosition() + currPI.getOriginalLength(); 170 found = true; 171 break; 172 } // if 173 174 if(relPos < currPos+currLen) { 175 if(relPos < currPos) { 176 // outside the range of information 177 result = -1; 178 } 179 else { 180 // current position + offset in this PositionInfo record 181 result = currPI.getOriginalPosition() + relPos - currPos; 182 } // if 183 found = true; 184 break; 185 } // if 186 } // for 187 188 if(!found) { 189 // after the last repositioning info 190 result = -1; 191 } // if - !found 192 } // if 193 194 return result; 195 } // getOriginalPos 196 197 /** Not finished yet */ 198 public long getExtractedPosFlow(long absPos) { 199 long result = -1; 200 return result; 201 } // getExtractedPosFlow 202 203 /** Not finished yet */ 204 public long getOriginalPosFlow(long relPos) { 205 long result = -1; 206 return result; 207 } // getOriginalPosFlow 208 209 /** 210 * Return the position info index containing <B>@param absPos</B> 211 * If there is no such position info return -1. 212 */ 213 public int getIndexByOriginalPosition(long absPos) { 214 PositionInfo currPI = null; 215 int result = -1; 216 217 int size = size(); 218 long origPos, origLen; 219 220 // Find with the liniear algorithm. Could be extended to binary search. 221 for(int i=0; i<size; ++i) { 222 currPI = (PositionInfo) get(i); 223 origPos = currPI.getOriginalPosition(); 224 origLen = currPI.getOriginalLength(); 225 226 if(absPos <= origPos+origLen) { 227 if(absPos >= origPos) { 228 result = i; 229 } // if 230 break; 231 } // if 232 } // for 233 234 return result; 235 } // getItemByOriginalPosition 236 237 /** 238 * Return the position info index containing <B>@param absPos</B> 239 * or the index of record before this position. 240 * Result is -1 if the position is before the first record. 241 * Rezult is size() if the position is after the last record. 242 */ 243 public int getIndexByOriginalPositionFlow(long absPos) { 244 PositionInfo currPI = null; 245 246 int size = size(); 247 int result = size; 248 long origPos, origLen; 249 250 // Find with the liniear algorithm. Could be extended to binary search. 251 for(int i=0; i<size; ++i) { 252 currPI = (PositionInfo) get(i); 253 origPos = currPI.getOriginalPosition(); 254 origLen = currPI.getOriginalLength(); 255 256 if(absPos <= origPos+origLen) { 257 // is inside of current record 258 if(absPos >= origPos) { 259 result = i; 260 } 261 else { 262 // not inside the current recort - return previous 263 result = i-1; 264 } // if 265 break; 266 } // if 267 } // for 268 269 return result; 270 } // getItemByOriginalPositionFlow 271 272 /** 273 * Correct the RepositioningInfo structure for shrink/expand changes. 274 * <br> 275 * 276 * Normaly the text peaces have same sizes in both original text and 277 * extracted text. But in some cases there are nonlinear substitutions. 278 * For example the sequence "<" is converted to "<". 279 * <br> 280 * 281 * The correction will split the corresponding PositionInfo structure to 282 * 3 new records - before correction, correction record and after correction. 283 * Front and end records are the same maner like the original record - 284 * m_origLength == m_currLength, since the middle record has different 285 * values because of shrink/expand changes. All records after this middle 286 * record should be corrected with the difference between these values. 287 * <br> 288 * 289 * All m_currPos above the current information record should be corrected 290 * with (origLen - newLen) i.e. 291 * <code> m_currPos -= origLen - newLen; </code> 292 * <br> 293 * 294 * @param originalPos Position of changed text in the original content. 295 * @param origLen Length of changed peace of text in the original content. 296 * @param newLen Length of new peace of text substiting the original peace. 297 */ 298 public void correctInformation(long originalPos, long origLen, long newLen) { 299 PositionInfo currPI; 300 PositionInfo frontPI, correctPI, endPI; 301 302 int index = getIndexByOriginalPositionFlow(originalPos); 303 304 // correct the index when the originalPos precede all records 305 if(index == -1) { 306 index = 0; 307 } // if 308 309 // correction of all other information records 310 // All m_currPos above the current record should be corrected with 311 // (origLen - newLen) i.e. <code> m_currPos -= origLen - newLen; </code> 312 313 for(int i=index; i<size(); ++i) { 314 currPI = (PositionInfo) get(i); 315 currPI.m_currPos -= origLen - newLen; 316 } // for 317 318 currPI = (PositionInfo) get(index); 319 if(originalPos >= currPI.m_origPos 320 && currPI.m_origPos + currPI.m_origLength >= originalPos + origLen) { 321 long frontLen = originalPos - currPI.m_origPos; 322 323 frontPI = new PositionInfo(currPI.m_origPos, 324 frontLen, 325 currPI.m_currPos, 326 frontLen); 327 correctPI = new PositionInfo(originalPos, 328 origLen, 329 currPI.m_currPos + frontLen, 330 newLen); 331 long endLen = currPI.m_origLength - frontLen - origLen; 332 endPI = new PositionInfo(originalPos + origLen, 333 endLen, 334 currPI.m_currPos + frontLen + newLen, 335 endLen); 336 337 set(index, frontPI); // substitute old element 338 if(endPI.m_origLength > 0) { 339 add(index+1, endPI); // insert new end element 340 } // if 341 if(correctPI.m_origLength > 0) { 342 add(index+1, correctPI); // insert middle new element 343 } // if 344 } // if - substitution range check 345 } // correctInformation 346 347 /** 348 * Correct the original position information in the records. When some text 349 * is shrinked/expanded by the parser. With this method is corrected the 350 * substitution of "\r\n" with "\n". 351 */ 352 public void correctInformationOriginalMove(long originalPos, long moveLen) { 353 PositionInfo currPI; 354 355 if(DEBUG) { 356 if(originalPos < 380) // debug information restriction 357 Out.println("Before correction: "+this); 358 } // DEBUG 359 360 int index = getIndexByOriginalPositionFlow(originalPos); 361 362 // correct the index when the originalPos precede all records 363 if(index == -1) { 364 index = 0; 365 } // if 366 367 // position is after all records in list 368 if(index == size()) { 369 return; 370 } // if 371 372 for(int i = index+1; i<size(); ++i) { 373 currPI = (PositionInfo) get(i); 374 currPI.m_origPos += moveLen; 375 } // for 376 377 currPI = (PositionInfo) get(index); 378 379 // should we split this record to two new records (inside the record) 380 if(originalPos > currPI.m_origPos) { 381 if(originalPos < currPI.m_origPos + currPI.m_origLength) { 382 PositionInfo frontPI, endPI; 383 long frontLen = originalPos - currPI.m_origPos; 384 frontPI = new PositionInfo(currPI.m_origPos, 385 frontLen, 386 currPI.m_currPos, 387 frontLen); 388 389 long endLen = currPI.m_origLength - frontLen; 390 endPI = new PositionInfo(originalPos + moveLen, 391 endLen, 392 currPI.m_currPos + frontLen, 393 endLen); 394 set(index, frontPI); // substitute old element 395 if(endPI.m_origLength != 0) { 396 add(index+1, endPI); // insert new end element 397 } // if - should add this record 398 399 if(DEBUG) { 400 if(originalPos < 380) { // debug information restriction 401 Out.println("Point 2. Current: "+currPI); 402 Out.println("Point 2. frontPI: "+frontPI); 403 Out.println("Point 2. endPI: "+endPI); 404 } 405 } // DEBUG 406 } // if - inside the record 407 } // if 408 else { 409 // correction if the position is before the current record 410 currPI.m_origPos += moveLen; 411 } 412 413 if(DEBUG) { 414 if(originalPos < 380) { 415 Out.println("Correction move: "+originalPos+", "+moveLen); 416 Out.println("Corrected: "+this); 417 Out.println("index: "+index); 418 /* 419 Exception ex = new Exception(); 420 Out.println("Call point: "); 421 ex.printStackTrace(); 422 */ 423 } 424 } // DEBUG 425 } // correctInformationOriginalMove 426 427 } // class RepositioningInfo
|
RepositioningInfo |
|