NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
open_trace_file.c
Go to the documentation of this file.
1 /*
2 Author: James Bonfield
3 
4 Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
5 All rights reserved
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11 this list of conditions and the following disclaimer.
12 
13  2. Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
16 
17  3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
18 MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
19 promote products derived from this software without specific prior written
20 permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /*
35 Copyright (c) 2008, 2009, 2013 Genome Research Ltd.
36 Author: James Bonfield <jkb@sanger.ac.uk>
37 
38 Redistribution and use in source and binary forms, with or without
39 modification, are permitted provided that the following conditions are met:
40 
41  1. Redistributions of source code must retain the above copyright notice,
42 this list of conditions and the following disclaimer.
43 
44  2. Redistributions in binary form must reproduce the above copyright notice,
45 this list of conditions and the following disclaimer in the documentation
46 and/or other materials provided with the distribution.
47 
48  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
49 Institute nor the names of its contributors may be used to endorse or promote
50 products derived from this software without specific prior written permission.
51 
52 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
53 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
54 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
56 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 */
63 
64 #include <stdlib.h>
65 #include <stdio.h>
66 #include <string.h>
67 #include <unistd.h>
68 #include <ctype.h>
69 #include <limits.h>
70 #include <sys/types.h>
71 #include <sys/stat.h>
72 #include "cram/os.h"
73 #ifndef PATH_MAX
74 # define PATH_MAX 1024
75 #endif
76 #ifdef HAVE_LIBCURL
77 # include <curl/curl.h>
78 #endif
79 
80 #include "cram/open_trace_file.h"
81 #include "cram/misc.h"
82 
/*
 * Tokenises the search path, splitting on colons (unix) or semicolons
 * (windows). A doubled colon "::" is an escape for a single literal colon,
 * which allows elements such as "URL=http:://host/%s".
 *
 * Blank components (leading, trailing or repeated separators) are dropped,
 * and "./" is always appended as the final element.
 *
 * searchpath may be NULL, which is treated as an empty path.
 *
 * Returns: A new search path with items separated by nul chars. Two nul
 *          chars in a row represent the end of the tokenised path.
 *          Returns NULL for a failure.
 *
 * The returned data has been malloced. It is up to the caller to free this
 * memory.
 */
char *tokenise_search_path(char *searchpath) {
    char *newsearch;
    size_t i, j;
    size_t len;
#ifdef _WIN32
    char path_sep = ';';
#else
    char path_sep = ':';
#endif

    if (!searchpath)
        searchpath = "";

    /*
     * Worst case output: every input byte kept, one nul closing the last
     * element, then "./" and the two terminating nuls.
     */
    len = strlen(searchpath);
    newsearch = malloc(len + 5);
    if (!newsearch)
        return NULL;

    for (i = 0, j = 0; i < len; i++) {
        /* "::" => ":". Used for escaping colons in http://foo */
        if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') {
            newsearch[j++] = ':';
            i++;
            continue;
        }

        if (searchpath[i] == path_sep) {
            /* Skip blank path components */
            if (j && newsearch[j-1] != 0)
                newsearch[j++] = 0;
        } else {
            newsearch[j++] = searchpath[i];
        }
    }

    /*
     * Close the last element only if it is still open. A path ending in a
     * separator has already been closed by the loop; adding a second nul
     * there would mark the end of the list and hide the "./" entry below.
     */
    if (j && newsearch[j-1] != 0)
        newsearch[j++] = 0;
    newsearch[j++] = '.';
    newsearch[j++] = '/';
    newsearch[j++] = 0;
    newsearch[j++] = 0;

    return newsearch;
}
138 
139 #ifdef HAVE_LIBCURL
140 mFILE *find_file_url(char *file, char *url) {
141  char buf[8192], *cp;
142  mFILE *mf = NULL, *headers = NULL;
143  int maxlen = 8190 - strlen(file);
144  static CURL *handle = NULL;
145  static int curl_init = 0;
146  char errbuf[CURL_ERROR_SIZE];
147 
148  *errbuf = 0;
149 
150  if (!curl_init) {
151  if (curl_global_init(CURL_GLOBAL_ALL))
152  return NULL;
153 
154  if (NULL == (handle = curl_easy_init()))
155  goto error;
156 
157  curl_init = 1;
158  }
159 
160  /* Expand %s for the trace name */
161  for (cp = buf; *url && cp - buf < maxlen; url++) {
162  if (*url == '%' && *(url+1) == 's') {
163  url++;
164  cp += strlen(strcpy(cp, file));
165  } else {
166  *cp++ = *url;
167  }
168  }
169  *cp++ = 0;
170 
171  /* Setup the curl */
172  if (NULL == (mf = mfcreate(NULL, 0)) ||
173  NULL == (headers = mfcreate(NULL, 0)))
174  return NULL;
175 
176  if (0 != curl_easy_setopt(handle, CURLOPT_URL, buf))
177  goto error;
178  if (0 != curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 60L))
179  goto error;
180  if (0 != curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION,
181  (curl_write_callback)mfwrite))
182  goto error;
183  if (0 != curl_easy_setopt(handle, CURLOPT_WRITEDATA, mf))
184  goto error;
185  if (0 != curl_easy_setopt(handle, CURLOPT_HEADERFUNCTION,
186  (curl_write_callback)mfwrite))
187  goto error;
188  if (0 != curl_easy_setopt(handle, CURLOPT_WRITEHEADER, headers))
189  goto error;
190  if (0 != curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf))
191  goto error;
192 
193  /* Fetch! */
194  if (0 != curl_easy_perform(handle))
195  goto error;
196 
197  /* Report errors is approproate. 404 is silent as it may have just been
198  * a search via RAWDATA path, everything else is worth reporting.
199  */
200  {
201  float version;
202  int response;
203  char nul = 0;
204  mfwrite(&nul, 1, 1, headers);
205  if (2 == sscanf(headers->data, "HTTP/%f %d", &version, &response)) {
206  if (response != 200) {
207  if (response != 404)
208  fprintf(stderr, "%.*s\n",
209  (int)headers->size, headers->data);
210  goto error;
211  }
212  }
213  }
214 
215  if (mftell(mf) == 0)
216  goto error;
217 
218  mfdestroy(headers);
219 
220  mrewind(mf);
221  return mf;
222 
223  error:
224  if (mf)
225  mfdestroy(mf);
226  if (headers)
227  mfdestroy(headers);
228  if (*errbuf)
229  fprintf(stderr, "%s\n", errbuf);
230  return NULL;
231 }
232 #endif
233 
234 /*
235  * Searches for file in the directory 'dirname'. If it finds it, it opens
236  * it. This also searches for compressed versions of the file in dirname
237  * too.
238  *
239  * Returns mFILE pointer if found
240  * NULL if not
241  */
242 static mFILE *find_file_dir(char *file, char *dirname) {
243  char path[PATH_MAX+1];
244  size_t len = strlen(dirname);
245  char *cp;
246 
247  if (dirname[len-1] == '/')
248  len--;
249 
250  /* Special case for "./" or absolute filenames */
251  if (*file == '/' || (len==1 && *dirname == '.')) {
252  sprintf(path, "%s", file);
253  } else {
254  /* Handle %[0-9]*s expansions, if required */
255  char *path_end = path;
256  *path = 0;
257  while ((cp = strchr(dirname, '%'))) {
258  char *endp;
259  long l = strtol(cp+1, &endp, 10);
260  if (*endp != 's') {
261  strncpy(path_end, dirname, (endp+1)-dirname);
262  path_end += (endp+1)-dirname;
263  dirname = endp+1;
264  continue;
265  }
266 
267  strncpy(path_end, dirname, cp-dirname);
268  path_end += cp-dirname;
269  if (l) {
270  strncpy(path_end, file, l);
271  path_end += MIN(strlen(file), l);
272  file += MIN(strlen(file), l);
273  } else {
274  strcpy(path_end, file);
275  path_end += strlen(file);
276  file += strlen(file);
277  }
278  len -= (endp+1) - dirname;
279  dirname = endp+1;
280  }
281  strncpy(path_end, dirname, len);
282  path_end += MIN(strlen(dirname), len);
283  *path_end = 0;
284  if (*file) {
285  *path_end++ = '/';
286  strcpy(path_end, file);
287  }
288 
289  //fprintf(stderr, "*PATH=\"%s\"\n", path);
290  }
291 
292  if (is_file(path)) {
293  return mfopen(path, "rb");
294  }
295 
296  return NULL;
297 }
298 
299 /*
300  * ------------------------------------------------------------------------
301  * Public functions below.
302  */
303 
304 /*
305  * Opens a trace file named 'file'. This is initially looked for as a
306  * pathname relative to a file named "relative_to". This may (for
307  * example) be the name of an experiment file referencing the trace
308  * file. In this case by passing relative_to as the experiment file
309  * filename the trace file will be picked up in the same directory as
310  * the experiment file. Relative_to may be supplied as NULL.
311  *
312  * 'file' is looked for at relative_to, then the current directory, and then
313  * all of the locations listed in 'path' (which is a colon separated list).
314  * If 'path' is NULL it uses the RAWDATA environment variable instead.
315  *
316  * Returns a mFILE pointer when found.
317  * NULL otherwise.
318  */
319 mFILE *open_path_mfile(char *file, char *path, char *relative_to) {
320  char *newsearch;
321  char *ele;
322  mFILE *fp;
323 
324  /* Use path first */
325  if (!path)
326  path = getenv("RAWDATA");
327  if (NULL == (newsearch = tokenise_search_path(path)))
328  return NULL;
329 
330  /*
331  * Step through the search path testing out each component.
332  * We now look through each path element treating some prefixes as
333  * special, otherwise we treat the element as a directory.
334  */
335  for (ele = newsearch; *ele; ele += strlen(ele)+1) {
336  int i;
337  char *suffix[6] = {"", ".gz", ".bz2", ".sz", ".Z", ".bz2"};
338  for (i = 0; i < 6; i++) {
339  char file2[1024];
340  char *ele2;
341  int valid = 1;
342 
343  /*
344  * '|' prefixing a path component indicates that we do not
345  * wish to perform the compression extension searching in that
346  * location.
347  */
348  if (*ele == '|') {
349  ele2 = ele+1;
350  valid = (i == 0);
351  } else {
352  ele2 = ele;
353  }
354 
355  sprintf(file2, "%s%s", file, suffix[i]);
356 
357 #if defined(HAVE_LIBCURL)
358  if (0 == strncmp(ele2, "URL=", 4)) {
359  if (valid && (fp = find_file_url(file2, ele2+4))) {
360  free(newsearch);
361  return fp;
362  }
363  } else
364 #endif
365  if (valid && (fp = find_file_dir(file2, ele2))) {
366  free(newsearch);
367  return fp;
368  }
369  }
370  }
371 
372  free(newsearch);
373 
374  /* Look in the same location as the incoming 'relative_to' filename */
375  if (relative_to) {
376  char *cp;
377  char relative_path[PATH_MAX+1];
378  strcpy(relative_path, relative_to);
379  if ((cp = strrchr(relative_path, '/')))
380  *cp = 0;
381  if ((fp = find_file_dir(file, relative_path)))
382  return fp;
383  }
384 
385  return NULL;
386 }