203 lines
5.7 KiB
OCaml
203 lines
5.7 KiB
OCaml
let length = Uuseg_string.fold_utf_8 `Grapheme_cluster (fun x _ -> x + 1) 0
|
|
|
|
let capitalize s =
|
|
let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in
|
|
|
|
let rec split_loop ?(acc = []) () =
|
|
match Uutf.decode dec with
|
|
| `Await -> assert false
|
|
| `End -> List.rev acc
|
|
| `Malformed _ignored -> split_loop ~acc ()
|
|
| `Uchar c -> split_loop ~acc:(c :: acc) ()
|
|
in
|
|
|
|
let buf = Buffer.create 1024 in
|
|
let enc = Uutf.encoder `UTF_8 (`Buffer buf) in
|
|
let rec capital_loop ?(last_was_upper = false) xs =
|
|
match xs with
|
|
| c :: tl ->
|
|
let last_was_upper =
|
|
if Uucp.Alpha.is_alphabetic c
|
|
then begin
|
|
let f = if last_was_upper = false then Uucp.Case.Map.to_upper else Uucp.Case.Map.to_lower in
|
|
match f c with
|
|
| `Self ->
|
|
let () = Uutf.encode enc (`Uchar c) |> ignore in
|
|
true
|
|
| `Uchars u_lst ->
|
|
List.iter (fun u -> Uutf.encode enc (`Uchar u) |> ignore) u_lst;
|
|
true
|
|
end
|
|
else
|
|
let () = Uutf.encode enc (`Uchar c) |> ignore in
|
|
false
|
|
in
|
|
capital_loop ~last_was_upper tl
|
|
| [] ->
|
|
let () = Uutf.encode enc `End |> ignore in
|
|
Buffer.contents buf
|
|
in
|
|
split_loop () |> capital_loop
|
|
|
|
let lowercase s =
|
|
let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in
|
|
|
|
let rec split_loop ?(acc = []) () =
|
|
match Uutf.decode dec with
|
|
| `Await -> assert false
|
|
| `End -> List.rev acc
|
|
| `Malformed _ignored -> split_loop ~acc ()
|
|
| `Uchar c -> split_loop ~acc:(c :: acc) ()
|
|
in
|
|
|
|
let buf = Buffer.create 1024 in
|
|
let enc = Uutf.encoder `UTF_8 (`Buffer buf) in
|
|
let rec to_lower xs =
|
|
match xs with
|
|
| c :: tl ->
|
|
if Uucp.Alpha.is_alphabetic c
|
|
then begin
|
|
match Uucp.Case.Map.to_lower c with
|
|
| `Self ->
|
|
let () = Uutf.encode enc (`Uchar c) |> ignore in
|
|
to_lower tl
|
|
| `Uchars u_lst ->
|
|
List.iter (fun u -> Uutf.encode enc (`Uchar u) |> ignore) u_lst;
|
|
to_lower tl
|
|
end
|
|
else
|
|
let () = Uutf.encode enc (`Uchar c) |> ignore in
|
|
to_lower tl
|
|
| [] ->
|
|
let () = Uutf.encode enc `End |> ignore in
|
|
Buffer.contents buf
|
|
in
|
|
split_loop () |> to_lower
|
|
|
|
let remove_non_alphabetic s =
|
|
let dec = Uutf.decoder ~encoding:`UTF_8 (`String s) in
|
|
|
|
let rec split_loop ?(acc = []) () =
|
|
match Uutf.decode dec with
|
|
| `Await -> assert false
|
|
| `End -> List.rev acc
|
|
| `Malformed _ignored -> split_loop ~acc ()
|
|
| `Uchar c -> split_loop ~acc:(c :: acc) ()
|
|
in
|
|
|
|
let buf = Buffer.create 1024 in
|
|
let enc = Uutf.encoder `UTF_8 (`Buffer buf) in
|
|
let rec filter_loop xs =
|
|
match xs with
|
|
| c :: tl ->
|
|
if Uucp.Alpha.is_alphabetic c
|
|
then begin
|
|
let () = Uutf.encode enc (`Uchar c) |> ignore in
|
|
filter_loop tl
|
|
end
|
|
else filter_loop tl
|
|
| [] ->
|
|
let () = Uutf.encode enc `End |> ignore in
|
|
Buffer.contents buf
|
|
in
|
|
split_loop () |> filter_loop
|
|
|
|
let split_in_chunks_of n s =
|
|
let last, chunks =
|
|
Uuseg_string.fold_utf_8
|
|
`Grapheme_cluster
|
|
(fun (last, chunks) grapheme ->
|
|
let l = List.length last in
|
|
if l < n
|
|
then (grapheme :: last, chunks)
|
|
else if l = n
|
|
then ([grapheme], (List.rev last |> StringLabels.concat ~sep:"") :: chunks)
|
|
else assert false)
|
|
([], [])
|
|
s
|
|
in
|
|
(List.rev last |> StringLabels.concat ~sep:"") :: chunks |> List.rev
|
|
|
|
let utf8_clamp_at n s =
|
|
let first =
|
|
Uuseg_string.fold_utf_8
|
|
`Grapheme_cluster
|
|
(fun acc grapheme -> if List.length acc < n then grapheme :: acc else acc)
|
|
[]
|
|
s
|
|
in
|
|
let first = String.concat "" (List.rev first) in
|
|
let l = String.length first in
|
|
let rest = String.sub s l (String.length s - l) in
|
|
(first, rest)
|
|
|
|
let clamp_at_space_up_to n s =
|
|
let module S = StringLabels in
|
|
let module L = ListLabels in
|
|
let words = S.split_on_char ~sep:' ' s |> L.map ~f:S.trim |> L.filter ~f:(( <> ) "") in
|
|
|
|
let words =
|
|
match words with
|
|
| first :: rest ->
|
|
let l_fst = length first in
|
|
if l_fst <= n
|
|
then first :: rest
|
|
else
|
|
(* Prima parola troppo lunga, forza lo split anche se non è sullo spazio *)
|
|
let fst, snd = utf8_clamp_at n first in
|
|
fst :: snd :: rest
|
|
| [] -> []
|
|
in
|
|
|
|
let rec loop acc words =
|
|
match words with
|
|
| hd :: tl ->
|
|
let l = length hd in
|
|
if l <= n
|
|
then loop (hd :: acc) tl
|
|
else
|
|
let words' = split_in_chunks_of n hd in
|
|
loop (L.rev words' @ acc) tl
|
|
| [] -> L.rev acc
|
|
in
|
|
let words = loop [] words in
|
|
|
|
let rec loop ?(ok = []) ?(total_chars = 0) ?(total_words = 0) words =
|
|
match words with
|
|
| hd :: tl ->
|
|
let l = length hd in
|
|
if total_chars + total_words + l > n
|
|
then (L.rev ok |> S.concat ~sep:" ", S.concat ~sep:" " words)
|
|
else loop ~ok:(hd :: ok) ~total_chars:(total_chars + l) ~total_words:(total_words + 1) tl
|
|
| [] -> (L.rev ok |> S.concat ~sep:" ", "")
|
|
in
|
|
loop words
|
|
|
|
let split_at_space_up_to n s =
|
|
let rec loop ?(acc = []) s =
|
|
let s', rest = clamp_at_space_up_to n s in
|
|
let acc = s' :: acc in
|
|
if rest = "" then List.rev acc else loop ~acc rest
|
|
in
|
|
loop s
|
|
|
|
let recode_string ?(encoding = `UTF_8) src =
|
|
let dst = Buffer.create 4 in
|
|
let rec loop d e =
|
|
match Uutf.decode d with
|
|
| `Uchar _ as u ->
|
|
let (_ : [`Ok | `Partial]) = Uutf.encode e u in
|
|
loop d e
|
|
| `End ->
|
|
let (_ : [`Ok | `Partial]) = Uutf.encode e `End in
|
|
()
|
|
| `Malformed _ ->
|
|
let (_ : [`Ok | `Partial]) = Uutf.encode e (`Uchar Uutf.u_rep) in
|
|
loop d e
|
|
| `Await -> assert false
|
|
in
|
|
let d = Uutf.decoder ~nln:(`NLF (Uchar.of_int 10)) ~encoding (`String src) in
|
|
let e = Uutf.encoder `UTF_8 (`Buffer dst) in
|
|
let () = loop d e in
|
|
Buffer.contents dst
|