open Unix;;
open Str;;
open Int32;;

(*
A preprocessed_file_buf structure contains the results of reading in
a single file and then expanding all includes and macros using the
cpp macro preprocessor.

Supported operations:
  - given a filename, read the file, run cpp, and create a
    preprocessed_file_buf
  - create a lexbuf
  - map a position returned by lexeme_start to a position record
*)

(*
Internally, a preprocessed_file_buf is represented as a big string,
which consists of the original file with the # line number directives removed.

In addition, there is a table mapping integer positions in the big string
to position records.  We store one pos record for each line, plus one position
record marking the end of the file.  This could
be more efficient, but then again, it could be worse.
*)

(* A location in one of the original (un-preprocessed) source files. *)
type position = {
  filename: string;  (* file the text originated from *)
  line: int;         (* line number within that file; counting starts at 1 *)
  column: int;       (* column within that line; the first column is 1 *)
};;

(* The result of running one file through cpp. *)
type preprocessed_file_buf = {
  (* the cpp output with every line that starts with '#' removed *)
  pp_buf: string;
  (* for each line of pp_buf, in buffer order: the offset within pp_buf at
     which the line starts, paired with the original position of that
     line's first column *)
  pp_pos_list: (int * position) list;
};;

(* Raised when cpp cannot be started or when a '#' line of its output cannot
   be parsed as a linemarker; the argument is a human-readable message. *)
exception CPPOutputException of string;;

(*
Read input one line at a time until there are no more lines.
Build a (reversed) list of all lines, as well as a list of position records.

   
Here, we make the assumptions that

a: any line in the preprocessor output beginning with '#' is a linemarker (see below).
In particular, we assume that no pre-processor directives have been ignored.

b: every time source code is included from a different file, we are supplied with
a line marker to tell us about it.

c: finally, we assume that the preprocessor preserves column offsets and linebreaks. 

d: no filenames have spaces in them

The following is taken from documentation found at
http://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html#Preprocessor%20Output


Source file name and line number information is conveyed by lines of the form

# linenum filename flags
  
  These are called linemarkers. They are inserted as needed into the output 
  (but never within a string or character constant). They mean that the following 
  line originated in file filename at line linenum. filename will never contain any 
  non-printing characters; they are replaced with octal escape sequences.
  
  After the file name comes zero or more flags, which are 1, 2, 3, or 4. 
  If there are multiple flags, spaces separate them. Here is what the flags mean:
  
  1 This indicates the start of a new file.
  2 This indicates returning to a file (after having included another file).
  3 This indicates that the following text comes from a system header file, so certain warnings should be suppressed.
  4 This indicates that the following text should be treated as being wrapped in an implicit extern "C" block.  
*)


(*
Read cpp output from ic one line at a time until End_of_file and build a
preprocessed_file_buf from it.

Lines starting with '#' are parsed as linemarkers: they are left out of the
buffer and set the filename and line number given to the line that follows.
Every other line is appended to the buffer (with its newline) and gets one
entry in the position list.  One extra entry marking the end of the buffer is
appended last, so the position list is never empty.

file_name is the filename assigned to lines seen before the first linemarker.

Raises CPPOutputException if a '#' line cannot be parsed as a linemarker.
The channel is not closed here.
*)
let preprocessed_file_buf_from_in_channel (ic:in_channel) (file_name:string): preprocessed_file_buf =
  (* kept lines, most recent first *)
  let buf_list  = ref [] in
  (* (offset, position) of the start of each kept line, most recent first *)
  let pos_list  = ref [] in
  (* offset within the pp_buf of the start of the next line *)
  let next_offset = ref 0 in
  (* position of the start of the next line *)
  let next_pos = ref {filename = file_name; line = 1; column = 1;} in
  (* compiled once instead of once per linemarker *)
  let separator = Str.regexp "[ \n]+" in
  (* Split a linemarker into (linenum, filename); the leading '#' token and
     any trailing flags are ignored.
     Note that this is a space split...filenames with spaces will break this *)
  let parse_linemarker (line:string):(int*string) =
    match Str.split separator line with
    | _::num::name::_ ->
      let linenum =
        try Int32.to_int (Int32.of_string num)
        with Failure _ ->
          raise (CPPOutputException ("could not split parse " ^num^ " as integer ")) in
      (linenum, name)
    | [] | [_] | [_; _] ->
      raise (CPPOutputException ("could not split " ^ line ^ " from cpp")) in
  (* next input line with its newline restored, or None at end of input *)
  let read_line () =
    try Some ((input_line ic) ^ "\n") with End_of_file -> None in
  let rec loop () =
    match read_line () with
    | None -> ()
    | Some line ->
      (* line always ends in a newline, so index 0 exists *)
      if (String.get line 0) = '#' then
      (
        (* a '#' at the beginning of a line is a line number directive;
           it is not added to the buffer or to the position list *)
        let (linenum, included_filename) = parse_linemarker line in
        next_pos := {filename = included_filename; line = linenum; column = 1;}
      )
      else
      (
        (* an ordinary line *)
        let here = !next_pos in
        pos_list := (!next_offset, here)::!pos_list;
        buf_list := line::!buf_list;
        next_pos := {here with line = here.line + 1};
        next_offset := !next_offset + (String.length line)
      );
      loop ()
  in
  loop ();
  (* position record marking the end of the file *)
  pos_list := (!next_offset, !next_pos)::!pos_list;
  {
    pp_buf = String.concat "" (List.rev !buf_list);
    pp_pos_list = List.rev !pos_list;
  }
;;

(*
Run cpp on filename and build a preprocessed_file_buf from its output.

The filename is shell-quoted before being placed on the command line, and
the cpp process is always closed and reaped, including when reading its
output raises.

Raises CPPOutputException if the process cannot be started, if the shell
reports that cpp could not be found or executed (exit status 126 or 127), or
if the output contains a '#' line that is not a valid linemarker.  Any other
exit status of cpp is ignored and whatever output was produced is returned.
*)
let preprocessed_file_buf_from_filename (filename:string): preprocessed_file_buf =
  let ic =
    try Unix.open_process_in ("cpp " ^ Filename.quote filename)
    with Unix.Unix_error _ | Sys_error _ ->
      raise (CPPOutputException "Could not run cpp") in
  let pp =
    try preprocessed_file_buf_from_in_channel ic filename
    with err ->
      (* reap the child before propagating; a failure while closing must not
         mask the original error *)
      (try ignore (Unix.close_process_in ic : Unix.process_status)
       with Unix.Unix_error _ | Sys_error _ -> ());
      raise err in
  match Unix.close_process_in ic with
  | Unix.WEXITED (126 | 127) ->
    (* the shell could not find or execute cpp *)
    raise (CPPOutputException "Could not run cpp")
  | Unix.WEXITED _ | Unix.WSIGNALED _ | Unix.WSTOPPED _ -> pp
;;

(* Create a lexer buffer that reads from the text stored in pp.pp_buf. *)
let lexbuf_from_preprocessed_file_buf (pp:preprocessed_file_buf):Lexing.lexbuf =
  let {pp_buf = text; pp_pos_list = _} = pp in
  Lexing.from_string text
;;

(*
Map an offset within pp.pp_buf to a position in the original source.

The position list is scanned from the front; the scan stops at the first
entry whose offset is greater than the requested offset, and the last entry
accepted before that supplies the filename and line.  The column is 1 plus
the distance from that entry's offset.  If even the first entry starts after
the requested offset, the first entry is used.

Raises Failure if pp.pp_pos_list is empty.
*)
let pos_from_offset (pp:preprocessed_file_buf) (offset:int):position =
  let make_pos ((line_start_offset:int), (line_start_pos:position)):position =
    {line_start_pos with column = 1 + (offset - line_start_offset)} in
  let rec find_line (best:int * position) (rest:(int * position) list):position =
    match rest with
    | [] -> make_pos best
    | ((start, _) as entry)::tail ->
        if start <= offset then find_line entry tail
        else make_pos best
  in
  match pp.pp_pos_list with
  | [] -> failwith "pos_from_offset: empty position list"
  | (first::_) as entries -> find_line first entries
;;

