cbf_lex.c

Go to the documentation of this file.
00001 /**********************************************************************
00002  * cbf_lex -- lexical scanner for CBF tokens                          *
00003  *                                                                    *
00004  * Version 0.6 13 January 1999                                        *
00005  *                                                                    *
00006  *            Paul Ellis (ellis@ssrl.slac.stanford.edu) and           *
00007  *         Herbert J. Bernstein (yaya@bernstein-plus-sons.com)        *
00008  **********************************************************************/
00009   
00010 /**********************************************************************
00011  *                               NOTICE                               *
00012  * Creative endeavors depend on the lively exchange of ideas. There   *
00013  * are laws and customs which establish rights and responsibilities   *
00014  * for authors and the users of what authors create.  This notice     *
00015  * is not intended to prevent you from using the software and         *
00016  * documents in this package, but to ensure that there are no         *
00017  * misunderstandings about terms and conditions of such use.          *
00018  *                                                                    *
00019  * Please read the following notice carefully.  If you do not         *
00020  * understand any portion of this notice, please seek appropriate     *
00021  * professional legal advice before making use of the software and    *
00022  * documents included in this software package.  In addition to       *
00023  * whatever other steps you may be obliged to take to respect the     *
00024  * intellectual property rights of the various parties involved, if   *
00025  * you do make use of the software and documents in this package,     *
00026  * please give credit where credit is due by citing this package,     *
00027  * its authors and the URL or other source from which you obtained    *
00028  * it, or equivalent primary references in the literature with the    *
00029  * same authors.                                                      *
00030  *                                                                    *
00031  * Some of the software and documents included within this software   *
00032  * package are the intellectual property of various parties, and      *
00033  * placement in this package does not in any way imply that any       *
00034  * such rights have in any way been waived or diminished.             *
00035  *                                                                    *
00036  * With respect to any software or documents for which a copyright    *
00037  * exists, ALL RIGHTS ARE RESERVED TO THE OWNERS OF SUCH COPYRIGHT.   *
00038  *                                                                    *
00039  * Even though the authors of the various documents and software      *
00040  * found here have made a good faith effort to ensure that the        *
00041  * documents are correct and that the software performs according     *
00042  * to its documentation, and we would greatly appreciate hearing of   *
00043  * any problems you may encounter, the programs and documents any     *
00044  * files created by the programs are provided **AS IS** without any   *
00045  * warranty as to correctness, merchantability or fitness for any     *
00046  * particular or general use.                                         *
00047  *                                                                    *
00048  * THE RESPONSIBILITY FOR ANY ADVERSE CONSEQUENCES FROM THE USE OF    *
00049  * PROGRAMS OR DOCUMENTS OR ANY FILE OR FILES CREATED BY USE OF THE   *
00050  * PROGRAMS OR DOCUMENTS LIES SOLELY WITH THE USERS OF THE PROGRAMS   *
00051  * OR DOCUMENTS OR FILE OR FILES AND NOT WITH AUTHORS OF THE          *
00052  * PROGRAMS OR DOCUMENTS.                                             *
00053  **********************************************************************/
00054  
00055 /**********************************************************************
00056  *                          The IUCr Policy                           *
00057  *                                 on                                 *
00058  *     the Use of the Crystallographic Information File (CIF)         *
00059  *                                                                    *
00060  * The Crystallographic Information File (Hall, Allen & Brown,        *
00061  * 1991) is, as of January 1992, the recommended method for           *
00062  * submitting publications to Acta Crystallographica Section C. The   *
00063  * International Union of Crystallography holds the Copyright on      *
00064  * the CIF, and has applied for Patents on the STAR File syntax       *
00065  * which is the basis for the CIF format.                             *
00066  *                                                                    *
00067  * It is a principal objective of the IUCr to promote the use of      *
00068  * CIF for the exchange and storage of scientific data. The IUCr's    *
00069  * sponsorship of the CIF development was motivated by its            *
00070  * responsibility to its scientific journals, which set the           *
00071  * standards in crystallographic publishing. The IUCr intends that    *
00072  * CIFs will be used increasingly for electronic submission of        *
00073  * manuscripts to these journals in future. The IUCr recognises       *
00074  * that, if the CIF and the STAR File are to be adopted as a means    *
00075  * for universal data exchange, the syntax of these files must be     *
00076  * strictly and uniformly adhered to. Even small deviations from      *
00077  * the syntax would ultimately cause the demise of the universal      *
00078  * file concept. Through its Copyrights and Patents the IUCr has      *
00079  * taken the steps needed to ensure strict conformance with this      *
00080  * syntax.                                                            *
00081  *                                                                    *
00082  * The IUCr policy on the use of the CIF and STAR File processes is   *
00083  * as follows:                                                        *
00084  * _________________________________________________________________  *
00085  *                                                                    *
00086  *  * 1 CIFs and STAR Files may be generated, stored or transmitted,  *
00087  *    without permission or charge, provided their purpose is not     *
00088  *    specifically for profit or commercial gain, and provided that   *
00089  *    the published syntax is strictly adhered to.                    *
00090  *  * 2 Computer software may be developed for use with CIFs or STAR  *
00091  *    files, without permission or charge, provided it is distributed *
00092  *    in the public domain. This condition also applies to software   *
00093  *    for which a charge is made, provided that its primary function  *
00094  *    is for use with files that satisfy condition 1 and that it is   *
00095  *    distributed as a minor component of a larger package of         *
00096  *    software.                                                       *
00097  *  * 3 Permission will be granted for the use of CIFs and STAR Files *
00098  *    for specific commercial purposes (such as databases or network  *
00099  *    exchange processes), and for the distribution of commercial     *
00100  *    CIF/STAR software, on written application to the IUCr Executive *
00101  *    Secretary, 2 Abbey Square, Chester CH1 2HU, England. The        *
00102  *    nature, terms and duration of the licences granted will be      *
00103  *    determined by the IUCr Executive and Finance Committees.        *
00104  *                                                                    *
00105  * _________________________________________________________________  *
00106  *                                                                    *
00107  * In summary, the IUCr wishes to promote the use of the STAR File    *
00108  * concepts as a standard universal data file. It will insist on      *
00109  * strict compliance with the published syntax for all                *
00110  * applications. To assist with this compliance, the IUCr provides    *
00111  * public domain software for checking the logical integrity of a     *
00112  * CIF, and for validating the data name definitions contained        *
00113  * within a CIF. Detailed information on this software, and the       *
00114  * associated dictionaries, may be obtained from the IUCr Office at   *
00115  * 5 Abbey Square, Chester CH1 2HU, England.                          *
00116  **********************************************************************/
00117 
00118 #ifdef __cplusplus
00119 
00120 extern "C" {
00121 
00122 #endif
00123 
00124 #include "cbf.h"
00125 #include "cbf_compress.h"
00126 #include "cbf_lex.h"
00127 #include "cbf_codes.h"
00128 #include "cbf_file.h"
00129 #include "cbf_string.h"
00130 #include "cbf_read_binary.h"
00131 #include "cbf_read_mime.h"
00132 
00133 #include <stdlib.h>
00134 #include <string.h>
00135 #include <ctype.h>
00136 
00137 
  /* Return an error code

     Evaluates "f", stores the result in (v)->errorcode and, when that
     result is non-zero, returns ERROR from the enclosing function.

     The expansion is a complete brace-enclosed block, so an invocation
     may be written without a trailing semicolon. */
  
#define cbf_errornez(f,v) { if (((v)->errorcode = (f)) != 0) return ERROR; }
00141 
00142 
00143   /* Return a copy of the text */
00144 
00145 int cbf_return_text (int code, YYSTYPE *val, const char *text, char type)
00146 {
00147   val->text = cbf_copy_string (NULL, text, type);
00148 
00149   if (!val->text)
00150   {
00151     val->errorcode = CBF_ALLOC;
00152 
00153     return ERROR;
00154   }
00155 
00156   return code;
00157 }
00158 
00159 
00160   /* Get the next token */
00161 
00162 int cbf_lex (YYSTYPE *val, cbf_file *file)
00163 {
00164   int data, loop, item, column, comment, string, ascii, 
00165       l, c, count, reprocess, errorcode, mime, encoding, bits, sign,
00166       checked_digest;
00167 
00168   long id, position;
00169   
00170   unsigned int file_column, compression;
00171   
00172   size_t size, length, code_size;
00173   
00174   const char *line;
00175     
00176   char out_line [(((sizeof (void *) +
00177                     sizeof (long int) * 2 +
00178                     sizeof (int) * 3) * CHAR_BIT) >> 2) + 55];
00179 
00180   char digest [25], new_digest [25];
00181 
00182 
00183   cbf_errornez (cbf_reset_buffer (file), val)
00184   
00185   l = c = file->last_read;
00186   
00187   column = c == '.';
00188   
00189   comment = c == '#';
00190   
00191   reprocess = (column || comment);
00192   
00193   data = loop = item = string = !reprocess;
00194   
00195   comment = !column;
00196   
00197   do
00198   {
00199     cbf_errornez (cbf_get_buffer (file, &line, &length), val)
00200     
00201     if (reprocess)
00202 
00203       reprocess = 0;
00204 
00205     else
00206     {
00207       l = c;
00208 
00209       c = cbf_read_character (file);
00210     }
00211     
00212 
00213       /* Discard spaces ([[:space:]]+) */
00214 
00215     if (length == 0)
00216 
00217       if (isspace (c))
00218 
00219          continue;
00220             
00221         
00222        /* DATA ([Dd][Aa][Tt][Aa][_][^[:space:]]*) */
00223     
00224     if (data)
00225 
00226       if (length < 5)
00227 
00228          data = toupper (c) == "DATA_" [length];
00229 
00230       else
00231 
00232         if (isspace (c) || c == EOF)
00233 
00234           return cbf_return_text (DATA, val, &line [5], 0);
00235    
00236    
00237        /* LOOP ([Ll][Oo][Oo][Pp][_]) */
00238      
00239     if (loop)
00240     {
00241       loop = toupper (c) == "LOOP_" [length];
00242 
00243       if (loop && length == 4)
00244 
00245         return LOOP;
00246     }
00247 
00248    
00249        /* ITEM ([_][^[:space:]\.]+) */
00250      
00251     if (item)
00252 
00253       if (length == 0)
00254 
00255         item = c == '_';
00256 
00257       else
00258       {
00259         item = !isspace (c) && c != '.' && c != '#' && c != EOF;
00260 
00261         if (length >= 2 && !item)
00262 
00263           if (c == '.')
00264 
00265             return cbf_return_text (CATEGORY, val, &line [1], 0);
00266             
00267           else
00268 
00269             return cbf_return_text (ITEM, val, &line [1], 0);
00270       }
00271 
00272    
00273       /* COLUMN (\.[^[:space:]]+) */
00274      
00275     if (column)
00276 
00277       if (isspace (c) || c == EOF)
00278 
00279         return cbf_return_text (COLUMN, val, &line [1], 0);
00280 
00281   
00282       /* STRING ([\'][^'\n]*[\'\n])|(([\"][^"\n]*[\"\n])) */
00283      
00284     if (string)
00285 
00286       if (length == 0)
00287 
00288         string = c == '\'' || c == '"';
00289 
00290       else
00291 
00292         if (c == line [0] || c == '\n' || c == EOF)
00293 
00294           if (line [0] == '\'')
00295 
00296             return cbf_return_text (STRING, val, &line [1], 
00297                                                   CBF_TOKEN_SQSTRING);
00298             
00299           else
00300 
00301             return cbf_return_text (STRING, val, &line [1],
00302                                                   CBF_TOKEN_DQSTRING);
00303 
00304 
00305        /* COMMENT ([#][^\n]*) */
00306      
00307     if (comment)
00308 
00309       if (length == 0)
00310 
00311         comment = c == '#';
00312 
00313       else
00314 
00315         if (c == '\n' || c == EOF)
00316 
00317           return cbf_return_text (COMMENT, val, &line [1], 0);
00318 
00319 
00320        /* WORD ([^[:space:]]+) */
00321      
00322     if (!data && !loop && !item && !comment && !string && !column)
00323 
00324       if (length && (isspace (c) || c == EOF))
00325 
00326           /* Missing value? */
00327 
00328         if (length == 1 && (line [0] == '?' || line [0] == '.'))
00329           
00330           return cbf_return_text (WORD, val, &line [0], CBF_TOKEN_NULL);
00331           
00332         else
00333         
00334           return cbf_return_text (WORD, val, &line [0], CBF_TOKEN_WORD);
00335 
00336 
00337       /* semicolon-delimited STRING (^;[^\n]*[\n])([^;][^\n]*[\n])*(;) */
00338       
00339     if (length == 0 && c == ';')
00340     {
00341       cbf_errornez (cbf_get_filecoordinates (file, NULL, &file_column), val)
00342 
00343       if (file_column == 1)
00344       {
00345           /* Save the position */
00346 
00347         cbf_errornez (cbf_get_fileposition (file, &position), val)
00348         
00349         mime = 0;
00350 
00351         do
00352         {
00353             /* Save the character */
00354             
00355           cbf_errornez (cbf_save_character (file, c), val)
00356           
00357           
00358             /* Check for a Mime boundary */
00359             
00360           if (c == '-')
00361           {
00362             cbf_errornez (cbf_get_buffer (file, &line, &length), val)
00363 
00364             cbf_nblen (line, &length);
00365             
00366             if (length > 29)
00367 
00368               mime = cbf_cistrcmp (&line [length - 30], 
00369                                    "\n--CIF-BINARY-FORMAT-SECTION--")
00370                                     == 0;
00371           }
00372 
00373 
00374             /* Read the next character */
00375             
00376           l = c;
00377          
00378           c = cbf_read_character (file);
00379           
00380           ascii = isgraph (c) || isspace (c);
00381         }
00382         while ((l != '\n' || c != ';') && !mime && ascii);
00383 
00384 
00385           /* Plain ASCII string */
00386           
00387         if (!mime && ascii)
00388         {
00389           cbf_errornez (cbf_get_buffer (file, &line, &length), val)
00390         
00391           ((char *) line) [length - 1] = '\0';
00392 
00393 
00394             /* Convert "\n\\;" -> "\n;" */
00395 
00396           for (count = 0; line [count]; count++)
00397 
00398             if (strncmp (&line [count], "\n\\;", 3) == 0)
00399 
00400               memmove ((void *) &line [count + 1], 
00401                        (void *) &line [count + 2], length - count - 2);
00402 
00403           return cbf_return_text (STRING, val, &line [1], 
00404                                                   CBF_TOKEN_SCSTRING);
00405         }
00406     
00407         encoding = ENC_NONE;
00408           
00409         bits = 0;
00410         
00411         sign = -1;
00412         
00413         checked_digest = 0;
00414         
00415 
00416           /* Mime header */
00417           
00418         if (mime)
00419         {
00420             /* Position */
00421           
00422           cbf_errornez (cbf_get_fileposition (file, &position), val)
00423           
00424         
00425             /* Read the header */
00426 
00427           cbf_errornez (cbf_parse_mimeheader (file, &encoding,
00428                                                     &size,
00429                                                     &id,
00430                                                     digest,
00431                                                     &compression,
00432                                                     &bits,
00433                                                     &sign), val)
00434 
00435 
00436             /* Check the digest? */
00437             
00438           if ((file->read_headers & MSG_DIGESTNOW) && 
00439                                     cbf_is_base64digest (digest))
00440           {
00441               /* Recalculate the digest (note that this will decode the
00442                  binary section but not save the result so this section
00443                  is not very efficient) */
00444               
00445             code_size = 0;
00446 
00447             switch (encoding)
00448             {
00449               case ENC_QP:
00450     
00451                 cbf_errornez (cbf_fromqp (file, NULL, size, &code_size, 
00452                                                          new_digest), val)
00453 
00454                 break;
00455       
00456               case ENC_BASE64:
00457     
00458                 cbf_errornez (cbf_frombase64 (file, NULL, size, &code_size, 
00459                                                          new_digest), val)
00460 
00461                 break;
00462       
00463               case ENC_BASE8:
00464               case ENC_BASE10:
00465               case ENC_BASE16:
00466     
00467                 cbf_errornez (cbf_frombasex (file, NULL, size, &code_size, 
00468                                                          new_digest),val)
00469 
00470                 break;
00471 
00472               case ENC_NONE:
00473 
00474                 cbf_errornez (cbf_parse_binaryheader (file, NULL, \
00475                                                             NULL, \
00476                                                             NULL, \
00477                                                             mime), val)
00478 
00479                 code_size = size;
00480 
00481                 cbf_errornez (cbf_get_fileposition (file, &position), val)
00482 
00483                 cbf_errornez (cbf_md5digest (file, code_size, new_digest), 
00484                                                               val)
00485                                                                   
00486                 break;
00487 
00488              default:
00489     
00490                cbf_errornez (CBF_FORMAT, val)
00491             }
00492             
00493             
00494               /* Check the number of characters read */
00495 
00496             if ((size && (size != code_size)) || code_size == 0)
00497 
00498               cbf_errornez (CBF_FORMAT, val)
00499               
00500 
00501               /* Compare the old digest to the new one */
00502 
00503             if (strcmp (digest, new_digest) != 0)
00504             
00505               cbf_errornez (CBF_FORMAT | 2, val)
00506 
00507             checked_digest = 1;
00508           }
00509           else
00510           {
00511               /* Calculate the minimum number of characters in the data */
00512               
00513             if (encoding == ENC_NONE)
00514             {
00515               cbf_errornez (cbf_parse_binaryheader (file, NULL, NULL, NULL, \
00516                                                                   mime), val)
00517         
00518               cbf_errornez (cbf_get_fileposition (file, &position), val)
00519 
00520               code_size = size;
00521             }
00522             else
00523             
00524               if (encoding == ENC_QP)
00525 
00526                 code_size = size;
00527               
00528               else
00529             
00530                 if (encoding == ENC_BASE64)
00531               
00532                   code_size = size * 8 / 6;
00533                 
00534                 else
00535           
00536                   code_size = size / 4;
00537 
00538 
00539               /* Skip to the end of the data */
00540 
00541             cbf_errornez (cbf_set_fileposition (file, code_size, SEEK_CUR), 
00542                                                       val)
00543           }
00544         }
00545         else
00546         {
00547             /* Simple binary */
00548                       
00549           cbf_errornez (cbf_parse_binaryheader (file, &size, \
00550                                                       &id,   \
00551                                                       &compression, mime), val)
00552         
00553           cbf_errornez (cbf_get_fileposition (file, &position), val)
00554 
00555           code_size = size;
00556 
00557 
00558             /* Skip to the end of the data */
00559 
00560           cbf_errornez (cbf_set_fileposition (file, code_size, SEEK_CUR), val)
00561         }
00562 
00563 
00564           /* Find the terminating semi-colon */
00565 
00566         c = 0;
00567           
00568         do
00569         {
00570           l = c;
00571         
00572           c = cbf_read_character (file);
00573           
00574           if (c == EOF)
00575           
00576             cbf_errornez (CBF_FILEREAD, val)
00577         }
00578         while (l != '\n' || c != ';');
00579 
00580 
00581           /* Check the element size and sign */
00582           
00583         if (bits < 0 || bits > 64)
00584         
00585           cbf_errornez (CBF_FORMAT, val)
00586         
00587         if (bits == 0)
00588         
00589           bits = 32;
00590           
00591         if (sign == -1)
00592         
00593           sign = 1;
00594 
00595 
00596           /* Add a connection */
00597           
00598         cbf_errornez (cbf_add_fileconnection (&file, NULL), val)
00599         
00600         
00601           /* Code the id, file, position, size and digest */
00602           
00603         if (!cbf_is_base64digest (digest))
00604         
00605           strcpy (digest, "------------------------");
00606           
00607         sprintf (out_line, "%x %p %lx %lx %d %s %x %d %u", 
00608                             id, file, position, size, checked_digest, 
00609                             digest, bits, sign, compression);
00610         
00611         if (encoding == ENC_NONE)
00612         
00613           errorcode = cbf_return_text (BINARY, val, out_line, 
00614                                                       CBF_TOKEN_BIN);
00615           
00616         else
00617         
00618           errorcode = cbf_return_text (BINARY, val, out_line, 
00619                                                       CBF_TOKEN_MIME_BIN);
00620 
00621         if (errorcode == ERROR)
00622         
00623           val->errorcode |= cbf_delete_fileconnection (&file);
00624           
00625         return errorcode;
00626       }
00627     }
00628 
00629 
00630       /* Add the character to the text */
00631       
00632     errorcode = cbf_save_character (file, c);
00633     
00634     cbf_errornez (errorcode, val);
00635   }
00636   while (c != EOF);
00637   
00638   return 0;
00639 }
00640 
00641 
00642 #ifdef __cplusplus
00643 
00644 }
00645 
00646 #endif