let length = Uuseg_string.fold_utf_8 `Grapheme_cluster (fun x _ -> x + 1) 0 let capitalize s = let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in let rec split_loop ?(acc = []) () = match Uutf.decode dec with | `Await -> assert false | `End -> List.rev acc | `Malformed _ignored -> split_loop ~acc () | `Uchar c -> split_loop ~acc:(c :: acc) () in let buf = Buffer.create 1024 in let enc = Uutf.encoder `UTF_8 (`Buffer buf) in let rec capital_loop ?(last_was_upper = false) xs = match xs with | c :: tl -> let last_was_upper = if Uucp.Alpha.is_alphabetic c then begin let f = if last_was_upper = false then Uucp.Case.Map.to_upper else Uucp.Case.Map.to_lower in match f c with | `Self -> let () = Uutf.encode enc (`Uchar c) |> ignore in true | `Uchars u_lst -> List.iter (fun u -> Uutf.encode enc (`Uchar u) |> ignore) u_lst; true end else let () = Uutf.encode enc (`Uchar c) |> ignore in false in capital_loop ~last_was_upper tl | [] -> let () = Uutf.encode enc `End |> ignore in Buffer.contents buf in split_loop () |> capital_loop let lowercase s = let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in let rec split_loop ?(acc = []) () = match Uutf.decode dec with | `Await -> assert false | `End -> List.rev acc | `Malformed _ignored -> split_loop ~acc () | `Uchar c -> split_loop ~acc:(c :: acc) () in let buf = Buffer.create 1024 in let enc = Uutf.encoder `UTF_8 (`Buffer buf) in let rec to_lower xs = match xs with | c :: tl -> if Uucp.Alpha.is_alphabetic c then begin match Uucp.Case.Map.to_lower c with | `Self -> let () = Uutf.encode enc (`Uchar c) |> ignore in to_lower tl | `Uchars u_lst -> List.iter (fun u -> Uutf.encode enc (`Uchar u) |> ignore) u_lst; to_lower tl end else let () = Uutf.encode enc (`Uchar c) |> ignore in to_lower tl | [] -> let () = Uutf.encode enc `End |> ignore in Buffer.contents buf in split_loop () |> to_lower let remove_non_alphabetic s = let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in let rec split_loop ?(acc = []) () = match Uutf.decode dec with | `Await -> assert false | `End -> List.rev acc | `Malformed _ignored -> split_loop ~acc () | `Uchar c -> split_loop ~acc:(c :: acc) () in let buf = Buffer.create 1024 in let enc = Uutf.encoder `UTF_8 (`Buffer buf) in let rec filter_loop xs = match xs with | c :: tl -> if Uucp.Alpha.is_alphabetic c then begin let () = Uutf.encode enc (`Uchar c) |> ignore in filter_loop tl end else filter_loop tl | [] -> let () = Uutf.encode enc `End |> ignore in Buffer.contents buf in split_loop () |> filter_loop let split_in_chunks_of n s = let last, chunks = Uuseg_string.fold_utf_8 `Grapheme_cluster (fun (last, chunks) grapheme -> let l = List.length last in if l < n then (grapheme :: last, chunks) else if l = n then ([grapheme], (List.rev last |> StringLabels.concat ~sep:"") :: chunks) else assert false) ([], []) s in (List.rev last |> StringLabels.concat ~sep:"") :: chunks |> List.rev let utf8_clamp_at n s = let first = Uuseg_string.fold_utf_8 `Grapheme_cluster (fun acc grapheme -> if List.length acc < n then grapheme :: acc else acc) [] s in let first = String.concat "" (List.rev first) in let l = String.length first in let rest = String.sub s l (String.length s - l) in (first, rest) let clamp_at_space_up_to n s = let module S = StringLabels in let module L = ListLabels in let words = S.split_on_char ~sep:' ' s |> L.map ~f:S.trim |> L.filter ~f:(( <> ) "") in let words = match words with | first :: rest -> let l_fst = length first in if l_fst <= n then first :: rest else (* Prima parola troppo lunga, forza lo split anche se non รจ sullo spazio *) let fst, snd = utf8_clamp_at n first in fst :: snd :: rest | [] -> [] in let rec loop acc words = match words with | hd :: tl -> let l = length hd in if l <= n then loop (hd :: acc) tl else let words' = split_in_chunks_of n hd in loop (L.rev words' @ acc) tl | [] -> L.rev acc in let words = loop [] words in let rec loop ?(ok = []) ?(total_chars = 0) ?(total_words = 0) words = match words with | hd :: tl -> let l = length hd in if total_chars + total_words + l > n then (L.rev ok |> S.concat ~sep:" ", S.concat ~sep:" " words) else loop ~ok:(hd :: ok) ~total_chars:(total_chars + l) ~total_words:(total_words + 1) tl | [] -> (L.rev ok |> S.concat ~sep:" ", "") in loop words let split_at_space_up_to n s = let rec loop ?(acc = []) s = let s', rest = clamp_at_space_up_to n s in let acc = s' :: acc in if rest = "" then List.rev acc else loop ~acc rest in loop s let recode_string ?(encoding = `UTF_8) src = let dst = Buffer.create 4 in let rec loop d e = match Uutf.decode d with | `Uchar _ as u -> let (_ : [`Ok | `Partial]) = Uutf.encode e u in loop d e | `End -> let (_ : [`Ok | `Partial]) = Uutf.encode e `End in () | `Malformed _ -> let (_ : [`Ok | `Partial]) = Uutf.encode e (`Uchar Uutf.u_rep) in loop d e | `Await -> assert false in let d = Uutf.decoder ~nln:(`NLF (Uchar.of_int 10)) ~encoding (`String src) in let e = Uutf.encoder `UTF_8 (`Buffer dst) in let () = loop d e in Buffer.contents dst