/* mecab.c 2013/10/17
   Copyright (c) SOFNEC Co., Ltd.
 */
#include <stdio.h>
#include <string.h>
#ifdef __APPLE__
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif
#include <mecab.h>
#include "aze_api.h"
#include <azst.h>

#define AZMECAB_VERSION   "1.2"

#define AZMECAB_NEW_MAX_ARG_NUM         30
#define AZMECAB_MAX_PARSE_STRING_LEN  4096
#define FEATURE_STRING_MAX_LEN        1023
#define FEATURE_STRING_MAX_LEN1       1024
#define FEATURE_MAX_NUM                 30

#define UNIFY_MECAB_NODE_FORM_SURFACE    0  /* non list form */
#define UNIFY_MECAB_NODE_FORM_NORMAL     4
#define UNIFY_MECAB_NODE_FORM_FULL      12

#define COMMA_CHAR   0x2c

static BASEINT MECAB_VERSION_ATOM;

static st_table* InstanceTable;


/*
 * mecab_new(-MECAB, +ARGS_LIST)
 *
 * Converts ARGS_LIST (a list of atoms) into a C argument vector, creates a
 * MeCab tagger with mecab_new(), and unifies MECAB with the tagger address
 * as an integer handle.  The handle is registered in InstanceTable so the
 * other predicates can check it before use.
 *
 * Fails when the argument list cannot be converted (negative count), when
 * mecab_new() returns NULL, or when MECAB cannot be unified with the handle.
 */
static AZE_C_PRED P2_mecab_new(AZE_ENV Env)
{
  /* Zero-filled so that the cleanup loop below only ever frees slots that
     the conversion routine actually populated. */
  char *buf[AZMECAB_NEW_MAX_ARG_NUM] = { 0 };
  int i, n;
  AZE_TERM arg_mecab, arg_args;
  mecab_t *m = NULL;

  AZE_PRED_BEGIN(Env);

  arg_mecab = AZE_ARG(2, 0);
  arg_args  = AZE_ARG(2, 1);

  n = aze_atom_list_to_cstring_array(Env, arg_args, buf, AZMECAB_NEW_MAX_ARG_NUM);

  if (n >= 0) {
    m = mecab_new(n, buf);
  }

  /* The argument strings are released on every path, including failure. */
  for (i = 0; i < AZMECAB_NEW_MAX_ARG_NUM; i++) {
    free(buf[i]);
    buf[i] = NULL;
  }

  if (n < 0) {
    AZE_PRED_FAIL;
  }
  if (m == NULL) {
    AZE_PRED_FAIL;
  }

  if (aze_unify_int(Env, (BASEINT )m, arg_mecab) != 0) {
    /* Nobody received the handle, so the tagger would be unreachable. */
    mecab_destroy(m);
    AZE_PRED_FAIL;
  }

  st_insert(InstanceTable, (st_data_t )m, (st_data_t )0);

  AZE_PRED_DET_SUCC;
}

/*
 * mecab_strerror(+MECAB, -ERROR)
 *
 * Unifies ERROR with an atom built from the string returned by
 * mecab_strerror() for the given tagger handle.
 *
 * Fails when MECAB is not a handle registered in InstanceTable, when
 * mecab_strerror() returns NULL, or when the unification fails.
 */
static AZE_C_PRED P2_mecab_strerror(AZE_ENV Env)
{
  AZE_TERM handle_term, message_term;
  mecab_t *tagger;
  char *message;

  AZE_PRED_BEGIN(Env);

  handle_term  = AZE_ARG(2, 0);
  message_term = AZE_ARG(2, 1);

  /* Only handles handed out by mecab_new are accepted. */
  tagger = (mecab_t* )aze_term_value_int(Env, handle_term);
  if (st_lookup(InstanceTable, (st_data_t )tagger, (st_data_t* )0) == 0) {
    AZE_PRED_FAIL;
  }

  message = (char* )mecab_strerror(tagger);
  if (message == NULL
      || aze_unify_atom(Env, aze_cstring_to_atom(Env, message), message_term) != 0) {
    AZE_PRED_FAIL;
  }

  AZE_PRED_DET_SUCC;
}

/*
 * mecab_destroy(+MECAB)
 *
 * Removes the tagger handle from InstanceTable and destroys the tagger.
 *
 * Fails when MECAB is NULL, is not registered in InstanceTable, or cannot
 * be removed from the table.
 */
static AZE_C_PRED P1_mecab_destroy(AZE_ENV Env)
{
  AZE_TERM arg_mecab;
  mecab_t *m;
  st_data_t key;

  AZE_PRED_BEGIN(Env);

  arg_mecab = AZE_ARG(1, 0);
  m = (mecab_t* )aze_term_value_int(Env, arg_mecab);

  /* st_delete() takes the key by address, so keep it in an object of the
     table's own key type instead of aliasing the mecab_t pointer. */
  key = (st_data_t )m;

  if (m == NULL || ! st_lookup(InstanceTable, key, (st_data_t* )0)) {
    AZE_PRED_FAIL;
  }

  /* Unregister first so the table never refers to a destroyed tagger. */
  if (st_delete(InstanceTable, &key, (st_data_t* )0) == 0) {
    AZE_PRED_FAIL;
  }

  mecab_destroy(m);

  AZE_PRED_DET_SUCC;
}

/*
 * Atoms naming the values of mecab_node_t.stat, indexed by the stat value.
 * MeCab defines five of them: MECAB_NOR_NODE (0), MECAB_UNK_NODE (1),
 * MECAB_BOS_NODE (2), MECAB_EOS_NODE (3) and MECAB_EON_NODE (4).
 */
#define MECAB_STAT_ATOM_NUM  5

static AZE_ATOM mecab_stat_table[MECAB_STAT_ATOM_NUM];
static AZE_ATOM mecab_NOR, mecab_UNK, mecab_BOS, mecab_EOS, mecab_EON;

/* Creates the node-status atoms and fills mecab_stat_table with them. */
static void init_node_atoms(void)
{
  mecab_NOR = aze_cstring_to_atom(AZE_ENV_NULL, "MECAB_NOR");
  mecab_UNK = aze_cstring_to_atom(AZE_ENV_NULL, "MECAB_UNK");
  mecab_BOS = aze_cstring_to_atom(AZE_ENV_NULL, "MECAB_BOS");
  mecab_EOS = aze_cstring_to_atom(AZE_ENV_NULL, "MECAB_EOS");
  mecab_EON = aze_cstring_to_atom(AZE_ENV_NULL, "MECAB_EON");

  mecab_stat_table[0] = mecab_NOR;
  mecab_stat_table[1] = mecab_UNK;
  mecab_stat_table[2] = mecab_BOS;
  mecab_stat_table[3] = mecab_EOS;
  mecab_stat_table[4] = mecab_EON;
}

/*
 * Maps a mecab_node_t.stat value to its atom.
 *
 * Values outside 0 .. MECAB_STAT_ATOM_NUM-1 are not defined by MeCab; they
 * are reported as MECAB_UNK rather than indexing outside the table.
 */
static AZE_ATOM int_to_mecab_node_stat_atom(int stat)
{
  if (stat < 0 || stat >= MECAB_STAT_ATOM_NUM) {
    return mecab_UNK;
  }
  return mecab_stat_table[stat];
}

/*
 * Splits a comma-separated MeCab feature string into atoms and builds a
 * Prolog list of them in ut.
 *
 * The string is copied into a local buffer and scanned character by
 * character using az_enc_len() for the current charset mode.  An empty
 * field between two commas yields an atom made from a zero-length string;
 * a comma at the very end of the string does not add a trailing empty field.
 *
 * Returns:
 *    0  on success
 *   -1  when s is NULL, empty, or longer than FEATURE_STRING_MAX_LEN bytes
 *   -3  when the string holds more than FEATURE_MAX_NUM fields
 *   otherwise the non-zero result of aze_make_prolog_list_from_atom_array()
 */
static int feature_string_to_atom_list(AZE_ENV Env, const char *s, AZE_TERM ut)
{
  unsigned char buf[FEATURE_STRING_MAX_LEN1];
  size_t len;
  int enc, step, fn;
  unsigned char *p, *q, *end;
  AZE_ATOM fa[FEATURE_MAX_NUM];

  if (s == NULL) return -1;

  len = strlen(s);
  if (len == 0 || len > FEATURE_STRING_MAX_LEN) return -1;

  /* len + 1 <= sizeof buf, so the terminator is copied as well. */
  memcpy(buf, s, len + 1);

  enc = az_charsetmode();

  fn = 0;
  end = buf + len;
  p = buf;
  while (p < end) {
    q = p;
    while (q < end && *q != COMMA_CHAR) {
      /* Clamp the reported character length so that a bad or truncated
         multi-byte sequence can neither stall the scan nor move q past
         the end of the string. */
      step = az_enc_len(enc, q);
      if (step < 1) step = 1;
      if (step > (int )(end - q)) step = (int )(end - q);
      q += step;
    }

    /* The scan stops either on a comma or at end; terminate the field in
       place when it stopped on a comma. */
    if (q < end) *q = 0;

    if (fn >= FEATURE_MAX_NUM) {
      return -3;
    }

    fa[fn++] = aze_cstring2_to_atom(Env, (char* )p, (int )(q - p));
    p = q + 1;
  }

  /* len > 0 guarantees at least one field was collected. */
  return aze_make_prolog_list_from_atom_array(Env, ut, fa, fn);
}

/*
 * Converts one MeCab node into Prolog data and unifies it with ut.
 *
 * form_num selects how many node fields are exported:
 *   0        the surface atom alone, not wrapped in a list
 *   1 .. 12  a list of the first form_num items of: surface, stat,
 *            feature list, char_type, isbest, wcost, cost, alpha, beta,
 *            prob, rcAttr, lcAttr
 *   > 12     handled as 12
 *   < 0      handled as 4
 *
 * Returns -1 when the feature string cannot be converted; otherwise the
 * result of aze_unify_cell_from_c_data() or
 * aze_unify_cell_from_c_data_list().
 */
static int unify_mecab_node(AZE_ENV Env, const mecab_node_t *mn,
                            int form_num, AZE_TERM ut)
{
  AZE_C_DATA_TYPE fields[UNIFY_MECAB_NODE_FORM_FULL];
  AZE_ATOM surface_atom;
  AZE_ATOM stat_atom = 0;
  AZE_TERM features = (AZE_TERM )NULL;
  int count;
  int ctype = 0, best = 0, word_cost = 0, total_cost = 0;
  int right_attr = 0, left_attr = 0;
  double alpha = 0.0, beta = 0.0, prob = 0.0;

  /* Normalise the requested form. */
  if (form_num > UNIFY_MECAB_NODE_FORM_FULL) {
    form_num = UNIFY_MECAB_NODE_FORM_FULL;
  }
  else if (form_num < 0) {
    form_num = UNIFY_MECAB_NODE_FORM_NORMAL;
  }

  /* The surface-only form still needs one data slot. */
  count = (form_num == UNIFY_MECAB_NODE_FORM_SURFACE) ? 1 : form_num;

  surface_atom = aze_cstring2_to_atom(Env, mn->surface, mn->length);

  /* The remaining node members are read only when more than the surface
     was asked for. */
  if (count > 1) {
    stat_atom = int_to_mecab_node_stat_atom(mn->stat);

    if (count > 2) {
      features = aze_global_cell_new(Env);
      if (feature_string_to_atom_list(Env, mn->feature, features) != 0) {
        aze_global_cell_new_cancel(Env);
        return -1;
      }
    }

    ctype      =    (int )mn->char_type;
    best       =          mn->isbest;
    word_cost  =    (int )mn->wcost;
    total_cost =    (int )mn->cost;
    alpha      = (double )mn->alpha;
    beta       = (double )mn->beta;
    prob       = (double )mn->prob;
    right_attr =    (int )mn->rcAttr;
    left_attr  =    (int )mn->lcAttr;
  }

  /* Fill exactly `count` slots, in field order. */
  AZE_C_DATA_SET_ATOM(fields[0], surface_atom);                      /* 1 */
  if (count > 1)  { AZE_C_DATA_SET_ATOM(fields[1], stat_atom); }     /* 2 */
  if (count > 2)  { AZE_C_DATA_SET_TERM(fields[2], features); }      /* 3 */
  if (count > 3)  { AZE_C_DATA_SET_INT(fields[3], ctype); }          /* 4 */
  if (count > 4)  { AZE_C_DATA_SET_INT(fields[4], best); }           /* 5 */
  if (count > 5)  { AZE_C_DATA_SET_INT(fields[5], word_cost); }      /* 6 */
  if (count > 6)  { AZE_C_DATA_SET_INT(fields[6], total_cost); }     /* 7 */
  if (count > 7)  { AZE_C_DATA_SET_DOUBLE(fields[7], alpha); }       /* 8 */
  if (count > 8)  { AZE_C_DATA_SET_DOUBLE(fields[8], beta); }        /* 9 */
  if (count > 9)  { AZE_C_DATA_SET_DOUBLE(fields[9], prob); }        /* 10 */
  if (count > 10) { AZE_C_DATA_SET_INT(fields[10], right_attr); }    /* 11 */
  if (count > 11) { AZE_C_DATA_SET_INT(fields[11], left_attr); }     /* 12 */

  /* The feature cell is deliberately not cancelled on this path; it is
     only cancelled above when the feature conversion fails. */
  if (form_num == UNIFY_MECAB_NODE_FORM_SURFACE) {
    return aze_unify_cell_from_c_data(Env, &(fields[0]), ut);
  }
  return aze_unify_cell_from_c_data_list(Env, fields, count, ut);
}

/*
 * Walks the node chain starting at mnode (following ->next) and builds a
 * Prolog list in ut with one element per node, each produced by
 * unify_mecab_node() using form_num.
 *
 * Returns 0 on success.  When adding a list item or converting a node
 * fails, the partially built list is closed with
 * aze_make_list_error_end() and -1 is returned.
 */
static int unify_mecab_node_list(AZE_ENV Env, const mecab_node_t *mnode,
                                 int form_num, AZE_TERM ut)
{
  const mecab_node_t *cur;
  AZE_TERM tail, item;

  tail = aze_make_list_init(Env, ut);

  for (cur = mnode; cur != NULL; cur = cur->next) {
    tail = aze_make_list_add_item(Env, tail, &item);

    /* The node is converted only if a list item could be added. */
    if (tail == NULL || unify_mecab_node(Env, cur, form_num, item) != 0) {
      aze_make_list_error_end(Env);
      return -1;
    }
  }

  aze_make_list_end(Env, tail);
  return 0;
}

/*
 * mecab_sparse_tonode(+MECAB, +SENTENCE, +FORM_NUM, -NODE_LIST)
 *
 * Parses SENTENCE with the given tagger and unifies NODE_LIST with the
 * resulting nodes, each rendered according to FORM_NUM (see
 * unify_mecab_node).
 *
 * Fails when MECAB is not a registered handle, when SENTENCE has a
 * non-positive C-string length, when memory cannot be allocated, when
 * mecab_sparse_tonode() returns NULL, or when the list cannot be built.
 */
static AZE_C_PRED P4_mecab_sparse_tonode(AZE_ENV Env)
{
  AZE_TERM handle_term, sentence_term, form_term, result_term;
  mecab_t *tagger;
  const mecab_node_t *first;
  char *sentence;
  int form, sentence_len, status;

  AZE_PRED_BEGIN(Env);

  handle_term   = AZE_ARG(4, 0);
  sentence_term = AZE_ARG(4, 1);
  form_term     = AZE_ARG(4, 2);
  result_term   = AZE_ARG(4, 3);

  tagger = (mecab_t* )aze_term_value_int(Env, handle_term);
  if (st_lookup(InstanceTable, (st_data_t )tagger, (st_data_t* )0) == 0) {
    AZE_PRED_FAIL;
  }

  form = aze_term_value_int(Env, form_term);

  sentence_len = az_term_to_cstring_length(Env, sentence_term);
  if (sentence_len <= 0) {
    AZE_PRED_FAIL;
  }

  /* One extra byte for the terminator. */
  sentence = (char* )malloc((size_t )(sentence_len + 1));
  if (sentence == 0) {
    AZE_PRED_FAIL;
  }
  az_term_to_cstring(Env, sentence_term, sentence, sentence_len + 1);

  /* The sentence copy is kept alive until the node list has been built,
     then released on both the success and the failure path. */
  first = mecab_sparse_tonode(tagger, sentence);
  if (first != NULL) {
    status = unify_mecab_node_list(Env, first, form, result_term);
  }
  else {
    status = -1;
  }
  free(sentence);

  if (status != 0) {
    AZE_PRED_FAIL;
  }

  AZE_PRED_DET_SUCC;
}

/*
 * mecab_sparse_tostr(+MECAB, +SENTENCE, -RESULT_STRING)
 *
 * Parses SENTENCE with the given tagger and unifies RESULT_STRING with the
 * text returned by mecab_sparse_tostr(), converted by az_cstring_to_list().
 *
 * Fails when MECAB is not a registered handle, when SENTENCE has a
 * non-positive C-string length, when memory cannot be allocated, when
 * mecab_sparse_tostr() returns NULL, or when the conversion fails.
 */
static AZE_C_PRED P3_mecab_sparse_tostr(AZE_ENV Env)
{
  AZE_TERM handle_term, sentence_term, result_term;
  mecab_t *tagger;
  const char *parsed;
  char *sentence;
  int sentence_len, status;

  AZE_PRED_BEGIN(Env);

  handle_term   = AZE_ARG(3, 0);
  sentence_term = AZE_ARG(3, 1);
  result_term   = AZE_ARG(3, 2);

  tagger = (mecab_t* )aze_term_value_int(Env, handle_term);
  if (st_lookup(InstanceTable, (st_data_t )tagger, (st_data_t* )0) == 0) {
    AZE_PRED_FAIL;
  }

  sentence_len = az_term_to_cstring_length(Env, sentence_term);
  if (sentence_len <= 0) {
    AZE_PRED_FAIL;
  }

  /* One extra byte for the terminator. */
  sentence = (char* )malloc((size_t )(sentence_len + 1));
  if (sentence == 0) {
    AZE_PRED_FAIL;
  }
  az_term_to_cstring(Env, sentence_term, sentence, sentence_len + 1);

  /* The input copy is released only after the result has been converted. */
  parsed = mecab_sparse_tostr(tagger, sentence);
  if (parsed != 0) {
    status = az_cstring_to_list(Env, parsed, (int )strlen(parsed), result_term);
  }
  else {
    status = -1;
  }
  free(sentence);

  if (status != 0) {
    AZE_PRED_FAIL;
  }

  AZE_PRED_DET_SUCC;
}

/* Number of bytes currently allocated for NBestBuf (0 until the first
   allocation). */
static int   NBestBufSize;
/* Heap buffer holding the C-string copy of the sentence most recently given
   to mecab_nbest_init().  It is shared by all taggers and is not released
   by any predicate visible in this file.
   NOTE(review): presumably kept alive because MeCab may refer to the
   sentence during later nbest_next calls -- confirm against the MeCab API. */
static char* NBestBuf;

/*
 * mecab_nbest_init(+MECAB, +SENTENCE)
 *
 * Copies SENTENCE into the file-level buffer NBestBuf (growing it when
 * needed) and starts an N-best enumeration with mecab_nbest_init().
 *
 * Fails when MECAB is not a registered handle, when SENTENCE has a
 * non-positive C-string length, when the buffer cannot be grown, or when
 * mecab_nbest_init() returns anything other than 1.
 */
static AZE_C_PRED P2_mecab_nbest_init(AZE_ENV Env)
{
  int r, len;
  AZE_TERM arg_mecab, arg_str;
  mecab_t *m;

  AZE_PRED_BEGIN(Env);

  arg_mecab = AZE_ARG(2, 0);
  arg_str   = AZE_ARG(2, 1);

  m = (mecab_t* )aze_term_value_int(Env, arg_mecab);
  if (! st_lookup(InstanceTable, (st_data_t )m, (st_data_t* )0)) {
    AZE_PRED_FAIL;
  }

  len = az_term_to_cstring_length(Env, arg_str);
  if (len <= 0) {
    AZE_PRED_FAIL;
  }

  /* len + 1 bytes are written below (text plus terminator), so the buffer
     must grow whenever its capacity is not strictly larger than len. */
  if (len >= NBestBufSize) {
    /* realloc(NULL, n) behaves like malloc(n).  The result goes through a
       temporary so that a failure leaves the old buffer and its recorded
       size intact. */
    char *grown = (char* )realloc(NBestBuf, (size_t )(len + 1));

    if (grown == 0) {
      AZE_PRED_FAIL;
    }

    NBestBuf = grown;
    NBestBufSize = len + 1;
  }

  az_term_to_cstring(Env, arg_str, NBestBuf, len + 1);

  r = mecab_nbest_init(m, NBestBuf);
  if (r != 1) {
    AZE_PRED_FAIL;
  }

  AZE_PRED_DET_SUCC;
}

/*
 * mecab_nbest_next_tonode(+MECAB, +FORM_NUM, -NODE_LIST)
 *
 * Fetches the next result of the current N-best enumeration with
 * mecab_nbest_next_tonode() and unifies NODE_LIST with its nodes, each
 * rendered according to FORM_NUM (see unify_mecab_node).
 *
 * Fails when MECAB is not a registered handle, when no node is returned,
 * or when the list cannot be built.
 */
static AZE_C_PRED P3_mecab_nbest_next_tonode(AZE_ENV Env)
{
  AZE_TERM handle_term, form_term, result_term;
  mecab_t *tagger;
  const mecab_node_t *first;
  int form;

  AZE_PRED_BEGIN(Env);

  handle_term = AZE_ARG(3, 0);
  form_term   = AZE_ARG(3, 1);
  result_term = AZE_ARG(3, 2);

  tagger = (mecab_t* )aze_term_value_int(Env, handle_term);
  if (st_lookup(InstanceTable, (st_data_t )tagger, (st_data_t* )0) == 0) {
    AZE_PRED_FAIL;
  }

  form  = aze_term_value_int(Env, form_term);
  first = mecab_nbest_next_tonode(tagger);

  /* The list is built only when a node was returned. */
  if (first == NULL
      || unify_mecab_node_list(Env, first, form, result_term) != 0) {
    AZE_PRED_FAIL;
  }

  AZE_PRED_DET_SUCC;
}

/*
 * mecab_nbest_end()
 *
 * Always succeeds and performs no work; in particular it does not release
 * the sentence buffer (NBestBuf) filled by mecab_nbest_init.
 */
static AZE_C_PRED P0_mecab_nbest_end(AZE_ENV Env)
{
  AZE_PRED_BEGIN(Env);

  AZE_PRED_DET_SUCC;
}

/*
 * ?- mecab_version(-VERSION).
 *
 * Unifies VERSION with MECAB_VERSION_ATOM, the atom prepared by
 * initiate_mecab().  Fails when the unification fails.
 */
static pred
P1_mecab_version(Frame *Env)
{
  AZE_TERM version_term;

  AZE_PRED_BEGIN(Env);

  version_term = AZE_ARG(1, 0);
  if (aze_unify_atom(Env, MECAB_VERSION_ATOM, version_term) == 0) {
    AZE_PRED_DET_SUCC;
  }

  AZE_PRED_FAIL;
}


/*
 * Module entry point: (re)creates the handle table, creates the node-status
 * atoms, registers every mecab_* predicate and prepares the version atom
 * returned by mecab_version/1.  Always returns 1.
 *
 * NOTE: when called again, the previous handle table is freed without
 * destroying the taggers it listed, so handles created before the call are
 * no longer accepted by the predicates.
 */
#ifdef WIN32
__declspec(dllexport) int initiate_mecab(Frame *Env)
#else
extern int initiate_mecab(Frame *Env)
#endif
{
#define DEFP(name,arity,f)   aze_define_pred(Env, name, arity, f)
  char buf[256];

  if (InstanceTable != 0)
    st_free_table(InstanceTable);

  InstanceTable = st_init_numtable();

  init_node_atoms();
  DEFP("mecab_new",               2, P2_mecab_new);
  DEFP("mecab_strerror",          2, P2_mecab_strerror);
  DEFP("mecab_destroy",           1, P1_mecab_destroy);
  DEFP("mecab_sparse_tonode",     4, P4_mecab_sparse_tonode);
  DEFP("mecab_sparse_tostr",      3, P3_mecab_sparse_tostr);
  DEFP("mecab_nbest_init",        2, P2_mecab_nbest_init);
  DEFP("mecab_nbest_next_tonode", 3, P3_mecab_nbest_next_tonode);
  DEFP("mecab_nbest_end",         0, P0_mecab_nbest_end);
  DEFP("mecab_version",           1, P1_mecab_version);

  /* The library version string comes from outside this module, so its
     contribution is capped with a precision; the fixed text plus 200 bytes
     always fits in buf. */
  sprintf(buf, "mecab-ext: %s, mecab: %.200s", AZMECAB_VERSION, mecab_version());
  MECAB_VERSION_ATOM = PutSystemAtom(Env, buf);

  return 1;
}
