|
RepositioningInfo |
|
1 /* 2 * RepositioningInfo.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Angel Kirilov, 04/January/2002 12 * 13 * $Id: RepositioningInfo.java,v 1.6 2002/01/30 14:49:37 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.io.*; 19 import java.util.ArrayList; 20 21 import gate.util.*; 22 23 /** 24 * RepositioningInfo keep information about correspondence of positions 25 * between the original and extracted document content. With this information 26 * this class could be used for computing of this correspondence in the strict 27 * way (return -1 where is no correspondence) 28 * or in "flow" way (return near computable position) 29 */ 30 31 public class RepositioningInfo extends ArrayList { 32 33 /** Freeze the serialization UID. */ 34 static final long serialVersionUID = -2895662600168468559L; 35 36 /** 37 * Just information keeper inner class. No significant functionality. 38 */ 39 public class PositionInfo implements Serializable { 40 41 /** Freeze the serialization UID. */ 42 static final long serialVersionUID = -7747351720249898499L; 43 44 /** Data members for one peace of text information */ 45 private long m_origPos, m_origLength, m_currPos, m_currLength; 46 47 /** The only constructor. We haven't set methods for data members. */ 48 public PositionInfo(long orig, long origLen, long curr, long currLen) { 49 m_origPos = orig; 50 m_origLength = origLen; 51 m_currPos = curr; 52 m_currLength = currLen; 53 } // PositionInfo 54 55 /** Position in the extracted (and probably changed) content */ 56 public long getCurrentPosition() { 57 return m_currPos; 58 } // getCurrentPosition 59 60 /** Position in the original content */ 61 public long getOriginalPosition() { 62 return m_origPos; 63 } // getOriginalPosition 64 65 /** Length of peace of text in the original content */ 66 public long getOriginalLength() { 67 return m_origLength; 68 } // getOriginalLength 69 70 /** Length of peace of text in the extracted content */ 71 public long getCurrentLength() { 72 return m_currLength; 73 } // getCurrentLength 74 75 /** For debug purposes */ 76 public String toString() { 77 return "("+m_origPos+","+m_origLength+"," 78 +m_currPos+","+m_currLength+")"; 79 } // toString 80 } // class PositionInfo 81 82 /** Default constructor */ 83 public RepositioningInfo() { 84 super(); 85 } // RepositioningInfo 86 87 /** Create a new position information record. */ 88 public void addPositionInfo(long origPos, long origLength, 89 long currPos, long currLength) { 90 // sorted add of new position 91 int insertPos = 0; 92 PositionInfo lastPI; 93 94 for(int i = size(); i>0; i--) { 95 lastPI = (PositionInfo) get(i-1); 96 if(lastPI.getOriginalPosition() < origPos) { 97 insertPos = i; 98 break; 99 } // if - sort key 100 } // for 101 102 add(insertPos, new PositionInfo(origPos, origLength, currPos, currLength)); 103 } // addPositionInfo 104 105 /** Compute position in extracted content by position in the original content. 106 * If there is no correspondence return -1. 107 */ 108 public long getExtractedPos(long absPos) { 109 long result = absPos; 110 PositionInfo currPI = null; 111 int size = size(); 112 113 if(size != 0) { 114 long origPos, origLen; 115 boolean found = false; 116 117 for(int i=0; i<size; ++i) { 118 currPI = (PositionInfo) get(i); 119 origPos = currPI.getOriginalPosition(); 120 origLen = currPI.getOriginalLength(); 121 122 if(absPos <= origPos+origLen) { 123 if(absPos < origPos) { 124 // outside the range of information 125 result = -1; 126 } 127 else { 128 // current position + offset in this PositionInfo record 129 result = currPI.getCurrentPosition() + absPos - origPos; 130 } // if 131 found = true; 132 break; 133 } // if 134 } // for 135 136 if(!found) { 137 // after the last repositioning info 138 result = -1; 139 } // if - !found 140 } // if 141 142 return result; 143 } // getExtractedPos 144 145 public long getOriginalPos(long relPos) { 146 return getOriginalPos(relPos, false); 147 } // getOriginalPos 148 149 /** Compute position in original content by position in the extracted content. 150 * If there is no correspondence return -1. 151 */ 152 public long getOriginalPos(long relPos, boolean afterChar) { 153 long result = relPos; 154 PositionInfo currPI = null; 155 int size = size(); 156 157 if(size != 0) { 158 long currPos, currLen; 159 boolean found = false; 160 161 for(int i=0; i<size; ++i) { 162 currPI = (PositionInfo) get(i); 163 currPos = currPI.getCurrentPosition(); 164 currLen = currPI.getCurrentLength(); 165 166 if(afterChar && relPos == currPos+currLen) { 167 result = currPI.getOriginalPosition() + currPI.getOriginalLength(); 168 found = true; 169 break; 170 } // if 171 172 if(relPos < currPos+currLen) { 173 if(relPos < currPos) { 174 // outside the range of information 175 result = -1; 176 } 177 else { 178 // current position + offset in this PositionInfo record 179 result = currPI.getOriginalPosition() + relPos - currPos; 180 } // if 181 found = true; 182 break; 183 } // if 184 } // for 185 186 if(!found) { 187 // after the last repositioning info 188 result = -1; 189 } // if - !found 190 } // if 191 192 return result; 193 } // getOriginalPos 194 195 /** Not finished yet */ 196 public long getExtractedPosFlow(long absPos) { 197 long result = -1; 198 return result; 199 } // getExtractedPosFlow 200 201 /** Not finished yet */ 202 public long getOriginalPosFlow(long relPos) { 203 long result = -1; 204 return result; 205 } // getOriginalPosFlow 206 207 /** 208 * Return the position info index containing <B>@param absPos</B> 209 * If there is no such position info return -1. 210 */ 211 public int getIndexByOriginalPosition(long absPos) { 212 PositionInfo currPI = null; 213 int result = -1; 214 215 int size = size(); 216 long origPos, origLen; 217 218 // Find with the liniear algorithm. Could be extended to binary search. 219 for(int i=0; i<size; ++i) { 220 currPI = (PositionInfo) get(i); 221 origPos = currPI.getOriginalPosition(); 222 origLen = currPI.getOriginalLength(); 223 224 if(absPos <= origPos+origLen) { 225 if(absPos >= origPos) { 226 result = i; 227 } // if 228 break; 229 } // if 230 } // for 231 232 return result; 233 } // getItemByOriginalPosition 234 235 /** 236 * Return the position info index containing <B>@param absPos</B> 237 * or the index of record before this position. 238 * Result is -1 if the position is before the first record. 239 * Rezult is size() if the position is after the last record. 240 */ 241 public int getIndexByOriginalPositionFlow(long absPos) { 242 PositionInfo currPI = null; 243 244 int size = size(); 245 int result = size; 246 long origPos, origLen; 247 248 // Find with the liniear algorithm. Could be extended to binary search. 249 for(int i=0; i<size; ++i) { 250 currPI = (PositionInfo) get(i); 251 origPos = currPI.getOriginalPosition(); 252 origLen = currPI.getOriginalLength(); 253 254 if(absPos <= origPos+origLen) { 255 // is inside of current record 256 if(absPos >= origPos) { 257 result = i; 258 } 259 else { 260 // not inside the current recort - return previous 261 result = i-1; 262 } // if 263 break; 264 } // if 265 } // for 266 267 return result; 268 } // getItemByOriginalPositionFlow 269 270 /** 271 * Correct the RepositioningInfo structure for shrink/expand changes. 272 * <br> 273 * 274 * Normaly the text peaces have same sizes in both original text and 275 * extracted text. But in some cases there are nonlinear substitutions. 276 * For example the sequence "<" is converted to "<". 277 * <br> 278 * 279 * The correction will split the corresponding PositionInfo structure to 280 * 3 new records - before correction, correction record and after correction. 281 * Front and end records are the same maner like the original record - 282 * m_origLength == m_currLength, since the middle record has different 283 * values because of shrink/expand changes. All records after this middle 284 * record should be corrected with the difference between these values. 285 * <br> 286 * 287 * All m_currPos above the current information record should be corrected 288 * with (origLen - newLen) i.e. 289 * <code> m_currPos -= origLen - newLen; </code> 290 * <br> 291 * 292 * @param originalPos Position of changed text in the original content. 293 * @param origLen Length of changed peace of text in the original content. 294 * @param newLen Length of new peace of text substiting the original peace. 295 */ 296 public void correctInformation(long originalPos, long origLen, long newLen) { 297 PositionInfo currPI; 298 PositionInfo frontPI, correctPI, endPI; 299 300 int index = getIndexByOriginalPositionFlow(originalPos); 301 302 // correct the index when the originalPos precede all records 303 if(index == -1) { 304 index = 0; 305 } // if 306 307 // correction of all other information records 308 // All m_currPos above the current record should be corrected with 309 // (origLen - newLen) i.e. <code> m_currPos -= origLen - newLen; </code> 310 311 for(int i=index; i<size(); ++i) { 312 currPI = (PositionInfo) get(i); 313 currPI.m_currPos -= origLen - newLen; 314 } // for 315 316 currPI = (PositionInfo) get(index); 317 if(originalPos >= currPI.m_origPos 318 && currPI.m_origPos + currPI.m_origLength >= originalPos + origLen) { 319 long frontLen = originalPos - currPI.m_origPos; 320 321 frontPI = new PositionInfo(currPI.m_origPos, 322 frontLen, 323 currPI.m_currPos, 324 frontLen); 325 correctPI = new PositionInfo(originalPos, 326 origLen, 327 currPI.m_currPos + frontLen, 328 newLen); 329 long endLen = currPI.m_origLength - frontLen - origLen; 330 endPI = new PositionInfo(originalPos + origLen, 331 endLen, 332 currPI.m_currPos + frontLen + newLen, 333 endLen); 334 335 set(index, frontPI); // substitute old element 336 if(endPI.m_origLength != 0) { 337 add(index+1, endPI); // insert new end element 338 } // if 339 add(index+1, correctPI); // insert middle new element 340 } // if - substitution range check 341 } // correctInformation 342 343 /** 344 * Correct the original position information in the records. When some text 345 * is shrinked/expanded by the parser. With this method is corrected the 346 * substitution of "\r\n" with "\n". 347 */ 348 public void correctInformationOriginalMove(long originalPos, long moveLen) { 349 PositionInfo currPI; 350 351 int index = getIndexByOriginalPositionFlow(originalPos); 352 353 // correct the index when the originalPos precede all records 354 if(index == -1) { 355 index = 0; 356 } // if 357 358 // position is after all records in list 359 if(index == size()) { 360 return; 361 } // if 362 363 for(int i = index+1; i<size(); ++i) { 364 currPI = (PositionInfo) get(i); 365 currPI.m_origPos += moveLen; 366 } // for 367 368 currPI = (PositionInfo) get(index); 369 370 // should we split this record to two new records (inside the record) 371 if(originalPos > currPI.m_origPos) { 372 if(originalPos < currPI.m_origPos + currPI.m_origLength) { 373 PositionInfo frontPI, endPI; 374 long frontLen = originalPos - currPI.m_origPos; 375 frontPI = new PositionInfo(currPI.m_origPos, 376 frontLen, 377 currPI.m_currPos, 378 frontLen); 379 380 long endLen = currPI.m_origLength - frontLen; 381 endPI = new PositionInfo(originalPos + frontLen + moveLen, 382 endLen, 383 currPI.m_currPos + frontLen, 384 endLen); 385 set(index, frontPI); // substitute old element 386 if(endPI.m_origLength != 0) { 387 add(index+1, endPI); // insert new end element 388 } // if - should add this record 389 } // if - inside the record 390 } // if 391 else { 392 // correction if the position is before the current record 393 currPI.m_origPos += moveLen; 394 } 395 } // correctInformationOriginalMove 396 397 } // class RepositioningInfo
|
RepositioningInfo |
|