ical2rem/lib/utf8.ml

let length = Uuseg_string.fold_utf_8 `Grapheme_cluster (fun x _ -> x + 1) 0

let capitalize s =
  let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in

  let rec split_loop ?(acc = []) () =
    match Uutf.decode dec with
    | `Await -> assert false
    | `End -> List.rev acc
    | `Malformed _ignored -> split_loop ~acc ()
    | `Uchar c -> split_loop ~acc:(c :: acc) ()
  in

  let buf = Buffer.create 1024 in
  let enc = Uutf.encoder `UTF_8 (`Buffer buf) in
  let rec capital_loop ?(last_was_upper = false) xs =
    match xs with
    | c :: tl ->
      let last_was_upper =
        if Uucp.Alpha.is_alphabetic c
        then begin
          let f = if last_was_upper = false then Uucp.Case.Map.to_upper else Uucp.Case.Map.to_lower in
          match f c with
          | `Self ->
            let () = Uutf.encode enc (`Uchar c) |> ignore in
            true
          | `Uchars u_lst ->
            List.iter (fun u -> Uutf.encode enc (`Uchar u) |> ignore) u_lst;
            true
        end
        else
          let () = Uutf.encode enc (`Uchar c) |> ignore in
          false
      in
      capital_loop ~last_was_upper tl
    | [] ->
      let () = Uutf.encode enc `End |> ignore in
      Buffer.contents buf
  in
  split_loop () |> capital_loop

let lowercase s =
  let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in

  let rec split_loop ?(acc = []) () =
    match Uutf.decode dec with
    | `Await -> assert false
    | `End -> List.rev acc
    | `Malformed _ignored -> split_loop ~acc ()
    | `Uchar c -> split_loop ~acc:(c :: acc) ()
  in

  let buf = Buffer.create 1024 in
  let enc = Uutf.encoder `UTF_8 (`Buffer buf) in
  let rec to_lower xs =
    match xs with
    | c :: tl ->
      if Uucp.Alpha.is_alphabetic c
      then begin
        match Uucp.Case.Map.to_lower c with
        | `Self ->
          let () = Uutf.encode enc (`Uchar c) |> ignore in
          to_lower tl
        | `Uchars u_lst ->
          List.iter (fun u -> Uutf.encode enc (`Uchar u) |> ignore) u_lst;
          to_lower tl
      end
      else
        let () = Uutf.encode enc (`Uchar c) |> ignore in
        to_lower tl
    | [] ->
      let () = Uutf.encode enc `End |> ignore in
      Buffer.contents buf
  in
  split_loop () |> to_lower

let remove_non_alphabetic s =
  let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in

  let rec split_loop ?(acc = []) () =
    match Uutf.decode dec with
    | `Await -> assert false
    | `End -> List.rev acc
    | `Malformed _ignored -> split_loop ~acc ()
    | `Uchar c -> split_loop ~acc:(c :: acc) ()
  in

  let buf = Buffer.create 1024 in
  let enc = Uutf.encoder `UTF_8 (`Buffer buf) in
  let rec filter_loop xs =
    match xs with
    | c :: tl ->
      if Uucp.Alpha.is_alphabetic c
      then begin
        let () = Uutf.encode enc (`Uchar c) |> ignore in
        filter_loop tl
      end
      else filter_loop tl
    | [] ->
      let () = Uutf.encode enc `End |> ignore in
      Buffer.contents buf
  in
  split_loop () |> filter_loop

let split_in_chunks_of n s =
  let last, chunks =
    Uuseg_string.fold_utf_8
      `Grapheme_cluster
      (fun (last, chunks) grapheme ->
        let l = List.length last in
        if l < n
        then (grapheme :: last, chunks)
        else if l = n
        then ([grapheme], (List.rev last |> StringLabels.concat ~sep:"") :: chunks)
        else assert false)
      ([], [])
      s
  in
  (List.rev last |> StringLabels.concat ~sep:"") :: chunks |> List.rev

let utf8_clamp_at n s =
  let first =
    Uuseg_string.fold_utf_8
      `Grapheme_cluster
      (fun acc grapheme -> if List.length acc < n then grapheme :: acc else acc)
      []
      s
  in
  let first = String.concat "" (List.rev first) in
  let l = String.length first in
  let rest = String.sub s l (String.length s - l) in
  (first, rest)

let clamp_at_space_up_to n s =
  let module S = StringLabels in
  let module L = ListLabels in
  let words = S.split_on_char ~sep:' ' s |> L.map ~f:S.trim |> L.filter ~f:(( <> ) "") in

  let words =
    match words with
    | first :: rest ->
      let l_fst = length first in
      if l_fst <= n
      then first :: rest
      else
        (* Prima parola troppo lunga, forza lo split anche se non è sullo spazio *)
        let fst, snd = utf8_clamp_at n first in
        fst :: snd :: rest
    | [] -> []
  in

  let rec loop acc words =
    match words with
    | hd :: tl ->
      let l = length hd in
      if l <= n
      then loop (hd :: acc) tl
      else
        let words' = split_in_chunks_of n hd in
        loop (L.rev words' @ acc) tl
    | [] -> L.rev acc
  in
  let words = loop [] words in

  let rec loop ?(ok = []) ?(total_chars = 0) ?(total_words = 0) words =
    match words with
    | hd :: tl ->
      let l = length hd in
      if total_chars + total_words + l > n
      then (L.rev ok |> S.concat ~sep:" ", S.concat ~sep:" " words)
      else loop ~ok:(hd :: ok) ~total_chars:(total_chars + l) ~total_words:(total_words + 1) tl
    | [] -> (L.rev ok |> S.concat ~sep:" ", "")
  in
  loop words

let split_at_space_up_to n s =
  let rec loop ?(acc = []) s =
    let s', rest = clamp_at_space_up_to n s in
    let acc = s' :: acc in
    if rest = "" then List.rev acc else loop ~acc rest
  in
  loop s

let recode_string ?(encoding = `UTF_8) src =
  let dst = Buffer.create 4 in
  let rec loop d e =
    match Uutf.decode d with
    | `Uchar _ as u ->
      let (_ : [`Ok | `Partial]) = Uutf.encode e u in
      loop d e
    | `End ->
      let (_ : [`Ok | `Partial]) = Uutf.encode e `End in
      ()
    | `Malformed _ ->
      let (_ : [`Ok | `Partial]) = Uutf.encode e (`Uchar Uutf.u_rep) in
      loop d e
    | `Await -> assert false
  in
  let d = Uutf.decoder ~nln:(`NLF (Uchar.of_int 10)) ~encoding (`String src) in
  let e = Uutf.encoder `UTF_8 (`Buffer dst) in
  let () = loop d e in
  Buffer.contents dst